from sklearn.metrics import cohen_kappa_score

# Scenario 1: both annotators assign identical labels to all 100 items.
annotations_a = 20 * ['positive', 'negative', 'positive', 'positive',
                      'negative']
annotations_b = list(annotations_a)  # separate list holding the same labels

kappa = cohen_kappa_score(annotations_a, annotations_b)
print(f"Kappa = {kappa:.3f}")
# Perfect agreement -> Kappa = 1.000

# Scenario 2: the annotators disagree on 5 of the 100 items
# (items 56-60 are 'positive' for A but 'negative' for B).
annotations_a = 60 * ['positive'] + 40 * ['negative']
annotations_b = 55 * ['positive'] + 45 * ['negative']

kappa = cohen_kappa_score(annotations_a, annotations_b)
print(f"Kappa = {kappa:.3f}")
# Kappa ≈ 0.898 (observed agreement 0.95, chance agreement 0.51)
def ner_f1(entities_a, entities_b):
    """Compute precision, recall and F1 between two entity annotations.

    Entities are compared by exact equality of the whole tuple, so two
    entities match only if text, type, start and end are all identical.
    Duplicates within one annotation are counted once. Annotation A is
    scored against annotation B, i.e. B plays the role of the reference.

    Args:
        entities_a: Iterable of hashable ``(text, type, start, end)`` tuples
            produced by annotator A.
        entities_b: Iterable of hashable ``(text, type, start, end)`` tuples
            produced by annotator B.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. A metric whose
        denominator is zero is reported as ``0.0``.
    """
    set_a = set(entities_a)
    set_b = set(entities_b)

    tp = len(set_a & set_b)  # marked by both annotators
    fp = len(set_a - set_b)  # marked by A only
    fn = len(set_b - set_a)  # marked by B only

    # Fall back to 0.0 (not int 0) so all three results are always floats.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) \
        if (precision + recall) > 0 else 0.0

    return precision, recall, f1


# Example: annotator B marks every entity A marked, plus one extra.
a = [("张伟", "PER", 0, 2), ("清华", "ORG", 5, 7)]
b = a + [("北京", "LOC", 10, 12)]

p, r, f1 = ner_f1(entities_a=a, entities_b=b)
print(f"P={p:.2f} R={r:.2f} F1={f1:.2f}")
# Expected output: P=1.00 R=0.67 F1=0.80
from statsmodels.stats.inter_rater import fleiss_kappa

# Each row is one item and each column one category; a cell holds the
# number of annotators who assigned that category to that item.
# Example: 5 items, 3 categories, 4 annotators per item.
counts = [
    [4, 0, 0],   # item 1: all 4 annotators chose category 0
    [0, 4, 0],   # item 2: all 4 annotators chose category 1
    [2, 2, 0],   # item 3: 2 chose category 0, 2 chose category 1
    [1, 1, 2],   # item 4: votes split across all three categories
    [0, 0, 4],   # item 5: all 4 annotators chose category 2
]

kappa = fleiss_kappa(table=counts)
print(f"Fleiss Kappa = {kappa:.3f}")