# Inter-annotator agreement examples: Cohen's kappa, entity-level F1, Fleiss' kappa.
from sklearn.metrics import cohen_kappa_score

# Example 1: two annotators who agree on every one of the 100 items.
annotations_a = ['positive', 'negative', 'positive', 'positive',
                 'negative'] * 20
annotations_b = ['positive', 'negative', 'positive', 'positive',
                 'negative'] * 20

kappa = cohen_kappa_score(annotations_a, annotations_b)
print(f"Kappa = {kappa:.3f}")
# Perfect agreement -> Kappa = 1.000

# Example 2: the annotators disagree on 5 of the 100 items
# (items 55-59 are 'positive' for A but 'negative' for B).
annotations_a = ['positive'] * 60 + ['negative'] * 40
annotations_b = ['positive'] * 55 + ['negative'] * 45

kappa = cohen_kappa_score(annotations_a, annotations_b)
print(f"Kappa = {kappa:.3f}")
# Observed agreement p_o = 0.95; chance agreement
# p_e = 0.60 * 0.55 + 0.40 * 0.45 = 0.51,
# so Kappa = (0.95 - 0.51) / (1 - 0.51) ≈ 0.898

def ner_f1(entities_a, entities_b):
    """Compute entity-level precision, recall and F1 between two annotations.

    Each argument is a list of ``(text, type, start, end)`` tuples. Entities
    are compared by exact equality of the whole tuple, and duplicates inside
    one list are counted once because both lists are converted to sets.

    ``entities_a`` is treated as the prediction and ``entities_b`` as the
    reference: precision is the share of A's entities that also appear in B,
    and recall is the share of B's entities that also appear in A.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Any ratio whose
        denominator is zero is reported as ``0.0``.
    """
    set_a = set(entities_a)
    set_b = set(entities_b)

    tp = len(set_a & set_b)  # annotated by both A and B
    fp = len(set_a - set_b)  # annotated by A but not by B
    fn = len(set_b - set_a)  # annotated by B but not by A

    # Use 0.0 (not int 0) in the degenerate cases so every element of the
    # result is a float regardless of the input.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return precision, recall, f1


# Example: B annotated one extra entity ("北京") that A did not.
a = [("张伟", "PER", 0, 2), ("清华", "ORG", 5, 7)]
b = [("张伟", "PER", 0, 2), ("清华", "ORG", 5, 7), ("北京", "LOC", 10, 12)]

p, r, f1 = ner_f1(a, b)
print(f"P={p:.2f} R={r:.2f} F1={f1:.2f}")
# P=1.00 R=0.67 F1=0.80

from statsmodels.stats.inter_rater import fleiss_kappa

# Rows are tasks, columns are categories; each cell is the number of
# annotators who assigned that category to that task.
# Example: 5 tasks, 3 categories, 4 annotators (every row sums to 4).
counts = [
    [4, 0, 0],  # task 1: all 4 annotators chose category 0
    [0, 4, 0],  # task 2: all 4 annotators chose category 1
    [2, 2, 0],  # task 3: 2 chose category 0, 2 chose category 1
    [1, 1, 2],  # task 4: annotators split across all three categories
    [0, 0, 4],  # task 5: all 4 annotators chose category 2
]

kappa = fleiss_kappa(counts)
print(f"Fleiss Kappa = {kappa:.3f}")
# Mean per-task agreement = 0.70, chance agreement = 0.335,
# so Fleiss Kappa = (0.70 - 0.335) / (1 - 0.335) ≈ 0.549