Evaluation
Evall log
{"files":{"EXIST2025_T1_1_es_hard_test_gold.json":{"description":"The file is correctly parser without errors or warnings.\\nFile name: EXIST2025_T1_1_es_hard_test_gold.json.","errors":{},"gold":true,"name":"EXIST2025_T1_1_es_hard_test_gold.json","status":"OK"},"preds_es_run_mini_mini_qwen_newprompt_with_summary.json":{"description":"The file is correctly parser without errors or warnings.\\nFile name: preds_es_run_mini_mini_qwen_newprompt_with_summary.json.","errors":{},"gold":false,"name":"preds_es_run_mini_mini_qwen_newprompt_with_summary.json","status":"OK"}},"metrics":{"Accuracy":{"acronym":"Acc","description":"Coming soon!","name":"Accuracy","results":{"average_per_test_case":0.8802889576883385,"test_cases":[{"average":0.8802889576883385,"name":"EXIST2025"}]},"status":"OK"},"FMeasure":{"acronym":"F1","description":"Coming soon!","name":"F-Measure","results":{"average_per_test_case":0.8253254781484742,"test_cases":[{"average":0.8253254781484742,"classes":{"NO":0.8265402843601896,"YES":0.8241106719367589},"name":"EXIST2025"}]},"status":"OK"},"ICM":{"acronym":"ICM","description":"Coming soon!","name":"Information Contrast model","results":{"average_per_test_case":0.6405705525591995,"test_cases":[{"average":0.6405705525591995,"name":"EXIST2025"}]},"status":"OK"},"ICMNorm":{"acronym":"ICM-Norm","description":"Coming soon!","name":"Normalized Information Contrast Model","results":{"average_per_test_case":0.8203268663913766,"test_cases":[{"average":0.8203268663913766,"name":"EXIST2025"}]},"status":"OK"},"Kappa":{"acronym":"Kappa","description":"Coming soon!","name":"Cohen's Kappa","results":{"average_per_test_case":0.7604950161724033,"test_cases":[{"average":0.7604950161724033,"name":"EXIST2025"}]},"status":"OK"},"Precision":{"acronym":"Pr","description":"Coming soon!","name":"Precision","results":{"average_per_test_case":0.7769742608972827,"test_cases":[{"average":0.7769742608972827,"classes":{"NO":0.7730496453900709,"YES":0.7808988764044944},"name":"EXIST2025"}]},"status":"OK"},"Recall":{"acronym":"Re","description":"Coming soon!","name":"Recall","results":{"average_per_test_case":0.8801843219797356,"test_cases":[{"average":0.8801843219797356,"classes":{"NO":0.8879837067209776,"YES":0.8723849372384938},"name":"EXIST2025"}]},"status":"OK"},"SystemPrecision":{"acronym":"SP","description":"Coming soon!","name":"System Precision","results":{"average_per_test_case":0.7768670309653917,"test_cases":[{"average":0.7768670309653917,"name":"EXIST2025"}]},"status":"OK"}}}
ICM Norm
0.82
metric_params
{"FMeasure":{"alfa_param":0.5,"custom":false},"ICM":{"alpha_1":2,"alpha_2":2,"beta":3,"custom":false}}
Sistema
Protegi_GPT5-mini&Qwen-2.5
Partición resultados
All
Precisión
0.78
Recall
0.88
F1
0.83
Accuracy
0.88
ICM
0.64
SystemPrecision
0.78
Kappa
0.76

