上传文件至 /

2026-06-09 11:18:50 +08:00
parent 8d04e62a3e
commit 6eccc27f2e
5 changed files with 450 additions and 0 deletions
--- a/data_clean.py
+++ b/data_clean.py
@@ -0,0 +1,26 @@
+import json
+
+# Load the raw crawler data; the code below iterates it and indexes each record by "quote".
+with open("movies.json", "r", encoding="utf-8") as f:
+    raw_movies = json.load(f)
+
+# Drop samples whose quote is empty or whitespace-only (strip() result is falsy).
+valid_movies = [movie for movie in raw_movies if movie["quote"].strip()]  # NOTE(review): raises if "quote" is missing or not a string — confirm crawler output
+
+# 1. Generate the Label Studio import file quotes_processed.txt: one JSON object {"text": <quote>} per line.
+with open("quotes_processed.txt", "w", encoding="utf-8") as out_f:
+    for item in valid_movies:
+        line = json.dumps({"text": item["quote"]}, ensure_ascii=False)  # quote is written as stored (not stripped); non-ASCII kept literal
+        out_f.write(line + "\n")
+
+# 2. Generate the statistics file process_stats.json: total, kept, and filtered-out sample counts.
+stats = {
+    "原始总样本": len(raw_movies),
+    "过滤后有效样本": len(valid_movies),
+    "过滤掉空短评样本": len(raw_movies) - len(valid_movies)
+}
+with open("process_stats.json", "w", encoding="utf-8") as f:
+    json.dump(stats, f, ensure_ascii=False, indent=2)
+
+print("数据清洗完成，已生成 quotes_processed.txt、process_stats.json")
+print(f"统计信息：{stats}")