"""Clean scraped movie data and export the short quotes for labelling.

Reads the raw crawler output, drops records without a usable ``quote``,
writes the remaining quotes as one JSON object per line (the Label Studio
import file) and writes a small statistics report about the filtering.
"""

import json

# Default file locations, relative to the current working directory.
INPUT_PATH = "movies.json"
QUOTES_PATH = "quotes_processed.txt"
STATS_PATH = "process_stats.json"


def has_quote(movie) -> bool:
    """Return True when *movie* is a dict carrying a non-blank string quote.

    Records with a missing ``quote`` key, a ``None``/non-string quote, or a
    whitespace-only quote are all treated as "empty" instead of raising
    ``KeyError`` / ``AttributeError`` in the middle of the run.
    """
    if not isinstance(movie, dict):
        return False
    quote = movie.get("quote")
    return isinstance(quote, str) and bool(quote.strip())


def filter_valid_movies(raw_movies) -> list:
    """Return the records of *raw_movies* that have a usable quote.

    The input order is preserved and the records themselves are not
    modified (quotes are kept exactly as scraped, without stripping).
    """
    return [movie for movie in raw_movies if has_quote(movie)]


def build_stats(raw_movies, valid_movies) -> dict:
    """Return the sample counts before and after filtering.

    The keys are part of the ``process_stats.json`` output format.
    """
    total = len(raw_movies)
    kept = len(valid_movies)
    return {
        "原始总样本": total,
        "过滤后有效样本": kept,
        "过滤掉空短评样本": total - kept,
    }


def main(input_path=INPUT_PATH, quotes_path=QUOTES_PATH, stats_path=STATS_PATH):
    """Run the cleaning pipeline and return the statistics dict.

    Args:
        input_path: JSON file holding the list of raw crawler records.
        quotes_path: Output file; one ``{"text": <quote>}`` JSON object
            per line, for import into Label Studio.
        stats_path: Output JSON file with the filtering statistics.

    Raises:
        FileNotFoundError: If *input_path* does not exist.
        json.JSONDecodeError: If *input_path* is not valid JSON.
    """
    # Load the raw crawler data.
    with open(input_path, "r", encoding="utf-8") as f:
        raw_movies = json.load(f)

    # Drop samples whose quote is missing or blank.
    valid_movies = filter_valid_movies(raw_movies)

    # 1. Write the Label Studio import file (JSON Lines).
    #    ensure_ascii=False keeps the Chinese text human-readable.
    with open(quotes_path, "w", encoding="utf-8") as out_f:
        out_f.writelines(
            json.dumps({"text": item["quote"]}, ensure_ascii=False) + "\n"
            for item in valid_movies
        )

    # 2. Write the statistics report.
    stats = build_stats(raw_movies, valid_movies)
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)

    print("数据清洗完成,已生成 quotes_processed.txt、process_stats.json")
    print(f"统计信息:{stats}")
    return stats


if __name__ == "__main__":
    main()