# Install jieba (using the current interpreter's pip for robustness)
import subprocess
import sys

subprocess.run([sys.executable, '-m', 'pip', 'install', 'jieba', '-q'])
print("jieba installed")

import jieba

print("=" * 50)
print("jieba segmentation demo")
print("=" * 50)

text = "我喜欢深度学习和人工智能"
print(f"Original: {text}")
print()

# Accurate mode (the default)
words精确 = list(jieba.cut(text, cut_all=False))
print(f"Accurate mode: {' / '.join(words精确)}")

# Full mode
words全 = list(jieba.cut(text, cut_all=True))
print(f"Full mode: {' / '.join(words全)}")

# Search-engine mode
words搜索 = list(jieba.cut_for_search(text))
print(f"Search-engine mode: {' / '.join(words搜索)}")

# More segmentation examples
import jieba

print("=" * 50)
print("More segmentation examples")
print("=" * 50)

examples = [
    "今天天气真不错",
    "人工智能是未来的发展方向",
    "Python是一门非常流行的编程语言",
    "小明毕业于清华大学计算机系",
    "我今天在京东买了一部iPhone手机"
]
for i, text in enumerate(examples):
    words = list(jieba.cut(text))
    print(f"{i+1}. {text}")
    print(f"   {' / '.join(words)}")
    print()

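# Aside: when jieba splits a domain term you would rather keep whole (a brand
# or product name like those in the examples above), its dictionary can be
# extended at runtime. A minimal sketch; the terms here are just illustrations:
jieba.add_word("iPhone手机")           # register a new custom term
jieba.suggest_freq("京东", tune=True)  # tune frequencies so a word stays intact
print(f"After add_word: {' / '.join(jieba.cut('我今天在京东买了一部iPhone手机'))}")
# For larger vocabularies, jieba.load_userdict('user_dict.txt') reads one term
# per line from a file (the file name here is hypothetical).
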
import jieba.posseg as pseg

print("=" * 50)
print("jieba POS tagging demo")
print("=" * 50)

text = "我喜欢深度学习和人工智能"
print(f"Original: {text}")
print()

words = pseg.cut(text)
print("Segmentation + POS tags:")
for word, flag in words:
    print(f"  {word}: {flag}")

import jieba

print("=" * 50)
print("Stop-word filtering demo")
print("=" * 50)

# A typical list of the most common Chinese stop words
stopwords = set(['的', '了', '是', '在', '我', '有', '和', '就', '不', '人',
                 '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
                 '你', '会', '着', '没有', '看', '好', '自己', '这'])

text = "人工智能是未来的发展方向,也是当前科技领域的热门话题"
print(f"Original: {text}")
print()

# Without stop-word removal
words_all = list(jieba.cut(text))
print(f"Without stop-word removal: {' / '.join(words_all)}")

# With stop-word removal
words_filtered = [w for w in words_all if w not in stopwords]
print(f"With stop-word removal: {' / '.join(words_filtered)}")
print()

# More complete stop-word lists can be downloaded online
print("Tip: in real projects, stop-word lists are available from:")
print("  - the HIT (Harbin Institute of Technology) stop-word list")
print("  - the Baidu stop-word list")
print("  - the Sichuan University Machine Learning Lab stop-word list")

# Hands-on: a complete text-preprocessing pipeline
import jieba

print("=" * 50)
print("Complete text-preprocessing pipeline")
print("=" * 50)

# Sample document collection
docs = [
    "今天天气真不错!适合出去玩。",
    "Python是一门很棒的编程语言。",
    "人工智能和机器学习是未来的发展方向。",
    "今天在咖啡馆喝了一杯很好喝的拿铁。"
]

# Stop-word list (the same typical list as above, plus punctuation)
stopwords = set(['的', '了', '是', '在', '我', '有', '和', '就', '不', '人',
                 '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去',
                 '你', '会', '着', '没有', '看', '好', '自己', '这',
                 '!', '。', ','])

def preprocess_text(text):
    """Complete text-preprocessing pipeline."""
    # 1. Segment
    words = jieba.cut(text)
    # 2. Remove stop words
    words = [w for w in words if w not in stopwords and len(w) > 0]
    # 3. Drop whitespace-only tokens
    words = [w for w in words if w.strip()]
    return words

print("Preprocessing results:")
for i, doc in enumerate(docs):
    words = preprocess_text(doc)
    print(f"\nDoc{i+1}: {doc}")
    print(f"  {' / '.join(words)}")

# Hands-on: jieba segmentation + TF-IDF, end to end
import jieba
import math

print("=" * 50)
print("Hands-on: jieba segmentation + TF-IDF, end to end")
print("=" * 50)

def simple_tfidf_tokenized(docs, stopwords=None):
    """
    TF-IDF on top of jieba segmentation.
    Parameters:
        docs: list of raw documents
        stopwords: set of stop words
    Returns:
        vocab, tfidf_matrix, tokenized_docs
    """
    # 1. Segment each document, keeping only tokens longer than one character
    tokenized = []
    for doc in docs:
        words = jieba.cut(doc)
        if stopwords:
            words = [w for w in words if w not in stopwords and len(w) > 1]
        else:
            words = [w for w in words if len(w) > 1]
        tokenized.append(words)

    # 2. Build the vocabulary
    vocab_set = set()
    for doc in tokenized:
        vocab_set.update(doc)
    vocab = sorted(vocab_set)
    word2idx = {word: i for i, word in enumerate(vocab)}  # O(1) lookup instead of list.index

    # 3. Build the term-frequency (TF) matrix
    n_docs = len(tokenized)
    tf_matrix = []
    for doc in tokenized:
        vec = [0] * len(vocab)
        for word in doc:
            vec[word2idx[word]] += 1  # every token is in vocab by construction
        tf_matrix.append(vec)

    # 4. Document frequency (DF): how many documents contain each word
    df_dict = {word: 0 for word in vocab}
    for vec in tf_matrix:
        for j, count in enumerate(vec):
            if count > 0:
                df_dict[vocab[j]] += 1

    # 5. Smoothed IDF: log(N / (DF + 1)) + 1
    idf = [math.log(n_docs / (df_dict[word] + 1)) + 1 for word in vocab]

    # 6. TF-IDF = TF * IDF
    tfidf = []
    for vec in tf_matrix:
        tfidf.append([vec[i] * idf[i] for i in range(len(vec))])

    return vocab, tfidf, tokenized

# Sample document collection
docs = [
    "Python是一门很棒的编程语言",
    "人工智能是未来的发展方向",
    "深度学习是机器学习的一个分支",
    "Python和Java都是很流行的编程语言"
]

# Stop words (a typical minimal list; only the multi-character entries matter
# here, since the len(w) > 1 filter already drops single characters)
stopwords = set(["是", "的", "一个", "很", "和", "都", "了"])

vocab, tfidf_matrix, tokenized = simple_tfidf_tokenized(docs, stopwords)

print("文档集合:")
for i, doc in enumerate(docs):
print(f" Doc{i+1}: {doc}")
print()
print(f"分词结果:")
for i, words in enumerate(tokenized):
print(f" Doc{i+1}: {' / '.join(words)}")
print()
print(f"词表(共{len(vocab)}个词):")
print(f" {vocab}")
print()
print("TF-IDF矩阵")
for i, vec in enumerate(tfidf_matrix):
# 只显示非零值
nonzero = [(vocab[j], round(vec[j], 4)) for j in range(len(vec)) if vec[j] > 0]
print(f" Doc{i+1}: {nonzero}")
print()
# 找每个文档最重要的词
print("每个文档最重要的词TF-IDF值最高")
for i, vec in enumerate(tfidf_matrix):
max_idx = max(range(len(vec)), key=lambda j: vec[j])
max_score = vec[max_idx]
if max_score > 0:
print(f" Doc{i+1}: '{vocab[max_idx]}' (TF-IDF={max_score:.4f})")