from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa

# --- Cohen's kappa: agreement between exactly two annotators ---------------

# Both annotators give identical labels for all 100 items.
annotations_a = ['positive', 'negative', 'positive', 'positive', 'negative'] * 20
annotations_b = ['positive', 'negative', 'positive', 'positive', 'negative'] * 20

kappa = cohen_kappa_score(annotations_a, annotations_b)
print(f"Kappa = {kappa:.3f}")
# Perfect agreement -> Kappa = 1.000

# A second example with some disagreement: the annotators differ on items
# 56-60 only (A says 'positive', B says 'negative').
annotations_a = ['positive'] * 60 + ['negative'] * 40
annotations_b = ['positive'] * 55 + ['negative'] * 45

kappa = cohen_kappa_score(annotations_a, annotations_b)
print(f"Kappa = {kappa:.3f}")
# Observed agreement p_o = 0.95, chance agreement
# p_e = 0.60 * 0.55 + 0.40 * 0.45 = 0.51,
# so Kappa = (0.95 - 0.51) / (1 - 0.51) ≈ 0.898


# --- Entity-level agreement for NER annotations ----------------------------

def ner_f1(entities_a, entities_b):
    """Compute exact-match precision, recall and F1 between two entity lists.

    Each argument is an iterable of hashable ``(text, type, start, end)``
    tuples. Two entities match only when all four fields are equal.
    Duplicates within one list are collapsed because the comparison is done
    on sets.

    ``entities_b`` plays the role of the reference: precision is the share of
    A's entities that B also has, recall is the share of B's entities that A
    also has. Swapping the arguments swaps precision and recall but leaves F1
    unchanged.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Any ratio whose
        denominator is zero (e.g. an empty entity list) is reported as 0.0.
    """
    set_a = set(entities_a)
    set_b = set(entities_b)

    tp = len(set_a & set_b)  # annotated by both
    fp = len(set_a - set_b)  # annotated by A but not by B
    fn = len(set_b - set_a)  # annotated by B but not by A

    # Guard every division so empty inputs yield 0.0 instead of raising.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1


# Example: B found one extra entity that A missed.
a = [("张伟", "PER", 0, 2), ("清华", "ORG", 5, 7)]
b = [("张伟", "PER", 0, 2), ("清华", "ORG", 5, 7), ("北京", "LOC", 10, 12)]
p, r, f1 = ner_f1(a, b)
print(f"P={p:.2f} R={r:.2f} F1={f1:.2f}")
# P=1.00 R=0.67 F1=0.80


# --- Fleiss' kappa: agreement among more than two annotators ---------------

# Rows are tasks, columns are categories; each cell is the number of
# annotators who assigned that category to that task.
# Example: 5 tasks, 3 categories, 4 annotators (every row sums to 4).
counts = [
    [4, 0, 0],  # task 1: all 4 chose category 0
    [0, 4, 0],  # task 2: all 4 chose category 1
    [2, 2, 0],  # task 3: 2 chose category 0, 2 chose category 1
    [1, 1, 2],  # task 4: annotators disagree across all categories
    [0, 0, 4],  # task 5: all 4 chose category 2
]
kappa = fleiss_kappa(counts)
print(f"Fleiss Kappa = {kappa:.3f}")
# Mean per-task agreement P = 0.70, chance agreement P_e = 0.335,
# so Fleiss Kappa = (0.70 - 0.335) / (1 - 0.335) ≈ 0.549