🏆 FM4SE Leaderboard: Community-Driven Evaluation of Top Foundation Models (FMs) in Software Engineering (SE) Tasks

SE Arena is an open-source platform designed to evaluate foundation models through human preference, fostering transparency and collaboration. It aims to empower the SE community to assess and compare the performance of leading FMs on SE-related tasks. For technical details, check out our paper.

{
  • "headers": [
    • "Rank",
    • "Model",
    • "Elo Score",
    • "Conversation Efficiency Index",
    • "Model Consistency Score",
    • "Average Win Rate",
    • "Bradley-Terry Coefficient",
    • "Eigenvector Centrality Value",
    • "Newman Modularity Score",
    • "PageRank Score"
    ],
  • "data": [
    • [
      • 1,
      • "claude-3-5-sonnet-20241022",
      • 1005.97,
      • 1,
      • "N/A",
      • 1,
      • 0,
      • 1,
      • null,
      • 0.08
      ],
    • [
      • 2,
      • "grok-3-fast-beta",
      • 1003.99,
      • 1,
      • "N/A",
      • 1,
      • 0,
      • 0,
      • null,
      • 0.08
      ],
    • [
      • 3,
      • "llama-3.1-405b",
      • 1003.95,
      • 1,
      • "N/A",
      • 1,
      • 0,
      • 0,
      • null,
      • 0.08
      ],
    • [
      • 4,
      • "gemini-2.0-flash-lite-preview",
      • 1002,
      • 1,
      • "N/A",
      • 1,
      • 0,
      • 0,
      • null,
      • 0.07
      ],
    • [
      • 5,
      • "o3-mini",
      • 1001.99,
      • 1,
      • "N/A",
      • 1,
      • 0,
      • 0,
      • null,
      • 0.05
      ],
    • [
      • 6,
      • "mistral-large-latest",
      • 1001.98,
      • 0.72,
      • "N/A",
      • 1,
      • 0,
      • 0,
      • null,
      • 0.05
      ],
    • [
      • 7,
      • "deepseek-chat",
      • 1001.94,
      • 1,
      • "N/A",
      • 1,
      • 0,
      • 0,
      • null,
      • 0.05
      ],
    • [
      • 8,
      • "claude-3-7-sonnet-20250219#thinking",
      • 999.99,
      • 0.33,
      • "N/A",
      • 1,
      • 0,
      • 0,
      • null,
      • 0.05
      ],
    • [
      • 8,
      • "claude-3-opus-20240229",
      • 999.99,
      • 0.53,
      • "N/A",
      • 0.5,
      • 0,
      • 0,
      • 0,
      • 0.04
      ],
    • [
      • 10,
      • "gpt-4-turbo",
      • 999.98,
      • -0.33,
      • "N/A",
      • 0.5,
      • 0,
      • 0,
      • 0,
      • 0.04
      ],
    • [
      • 11,
      • "gemma-3-27b-it",
      • 998.02,
      • 0.3,
      • "N/A",
      • 0.33,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 11,
      • "gpt-4o-mini",
      • 998.02,
      • -1,
      • "N/A",
      • 0.12,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 13,
      • "gemini-1.5-flash",
      • 998.01,
      • -0.6,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 13,
      • "claude-3-haiku-20240307",
      • 998.01,
      • -1,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 13,
      • "o4-mini",
      • 998.01,
      • 0.3,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 16,
      • "o1-mini",
      • 998,
      • 0.3,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 16,
      • "deepseek-r1",
      • 998,
      • -1,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 16,
      • "gemini-2.0-pro-exp",
      • 998,
      • -1,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 16,
      • "o1",
      • 998,
      • -1,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 16,
      • "gpt-4o",
      • 998,
      • -1,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 21,
      • "grok-3-beta",
      • 997.99,
      • 0.3,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 22,
      • "gpt-4.1-mini",
      • 996.01,
      • -1,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 22,
      • "claude-3-7-sonnet-latest",
      • 996.01,
      • -0.13,
      • "N/A",
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 24,
      • "gemini-1.5-pro",
      • 996,
      • "N/A",
      • 1,
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ],
    • [
      • 25,
      • "gpt-3.5-turbo",
      • 988.14,
      • -0.63,
      • 0.33,
      • 0,
      • 0,
      • 0,
      • 0,
      • 0.03
      ]
    ],
  • "metadata": null
}

Made with ❤️ for SE Arena. If this work is useful to you, please consider citing:

@inproceedings{zhao2025se,
title={SE Arena: An Interactive Platform for Evaluating Foundation Models in Software Engineering},
author={Zhao, Zhimin},
booktitle={ACM International Conference on AI Foundation Models and Software Engineering},
year={2025}}