上传文件至 /
This commit is contained in:
26
data_clean.py
Normal file
26
data_clean.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import json

# Load the raw crawler output. The rest of the script iterates this value and
# reads each item's "quote" field, so it is assumed to be a list of dicts.
with open("movies.json", "r", encoding="utf-8") as f:
    raw_movies = json.load(f)

# Keep only the records that carry a non-blank quote. Using .get() plus an
# isinstance check means records whose "quote" is missing, None, or not a
# string are filtered out (and counted below) instead of aborting the run
# with KeyError / AttributeError.
valid_movies = [
    movie
    for movie in raw_movies
    if isinstance(movie.get("quote"), str) and movie["quote"].strip()
]

# 1. Write the Label Studio import file quotes_processed.txt:
#    one JSON object per line, of the form {"text": <quote>}.
with open("quotes_processed.txt", "w", encoding="utf-8") as out_f:
    for item in valid_movies:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        line = json.dumps({"text": item["quote"]}, ensure_ascii=False)
        out_f.write(line + "\n")

# 2. Write the statistics file process_stats.json.
#    The keys are runtime data consumed downstream and are kept verbatim.
stats = {
    "原始总样本": len(raw_movies),
    "过滤后有效样本": len(valid_movies),
    "过滤掉空短评样本": len(raw_movies) - len(valid_movies),
}
with open("process_stats.json", "w", encoding="utf-8") as f:
    json.dump(stats, f, ensure_ascii=False, indent=2)

print("数据清洗完成,已生成 quotes_processed.txt、process_stats.json")
print(f"统计信息:{stats}")
|
||||
Reference in New Issue
Block a user