Files
task-2-4-regular-expression/260403-2509165039.py
2026-04-03 11:17:04 +08:00

112 lines
2.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Exercise 1: extract weather-forecast records (date, weather, low, high).
import re

text = '''
2024-03-15 天气:晴 温度15-25°C
2024-03-16 天气:多云 温度12-20°C
2024-03-17 天气:小雨 温度10-18°C
'''

# Each record reads "<date> 天气<sep><weather> 温度<sep><low>-<high>°C".
# The separator after a label may be an ASCII colon, a full-width colon
# (U+FF1A), or absent (the sample data has none after 温度), so both labels
# take an optional colon of either kind instead of a mandatory ASCII ":".
# Groups: 1 = date, 2 = weather, 3 = low temperature, 4 = high temperature.
pattern = (
    r'(\d{4}-\d{2}-\d{2})\s*'
    r'天气[:\uFF1A]?([^ ]+)\s*'
    r'温度[:\uFF1A]?(\d+)-(\d+)°C'
)

# findall returns one 4-tuple of strings per matched record.
matches = re.findall(pattern, text)
for date, weather, low, high in matches:
    print(f'{date}: {weather}, {low}°C-{high}°C')
# Exercise 2: scrape movie information (title, year, rating, director)
# from a Douban-style HTML snippet.
import re

html = '''
<div class="movie">
<h2 class="name">《流浪地球》</h2>
<span class="year">(2024)</span>
<span class="rating">8.5</span>
<span class="director">导演:郭帆</span>
</div>
<div class="movie">
<h2 class="name">《你好,李焕英》</h2>
<span class="year">(2024)</span>
<span class="rating">7.9</span>
<span class="director">导演:贾玲</span>
</div>
'''

# One pattern per field; each captures only the field's text content.
name_pattern = r'<h2 class="name">《([^》]+)》</h2>'
year_pattern = r'<span class="year">\((\d{4})\)</span>'
rating_pattern = r'<span class="rating">([^<]+)</span>'
# Accept either an ASCII or a full-width (U+FF1A) colon after the label.
director_pattern = r'导演[:\uFF1A]([^<]+)'

names = re.findall(name_pattern, html)
years = re.findall(year_pattern, html)
ratings = re.findall(rating_pattern, html)
directors = re.findall(director_pattern, html)

# Walk the four lists in lockstep.  zip stops at the shortest list, so a
# movie with a missing field can no longer trigger an IndexError; this
# still assumes every movie carries all four fields in the same order.
for name, year, rating, director in zip(names, years, ratings, directors):
    print(f"{name} | {year} | 评分:{rating} | {director}")
# Exercise 3: access-log analysis -- pull client IP, timestamp and HTTP
# status out of each log line.
import re

log = '''
192.168.1.100 - - [15/Mar/2024:10:15:30 +0800] "GET /index.html HTTP/1.1" 200 1234
10.0.0.50 - - [15/Mar/2024:10:15:31 +0800] "POST /api/login HTTP/1.1" 200 256
192.168.1.101 - - [15/Mar/2024:10:15:32 +0800] "GET /notfound.html HTTP/1.1" 404 512
172.16.0.200 - - [15/Mar/2024:10:15:33 +0800] "GET /images/logo.png HTTP/1.1" 200 4096
'''

# Groups: 1 = dotted IPv4-looking address, 2 = text inside [...],
# 3 = three-digit status that follows the closing quote of the request.
# "." does not match newlines here, so each match stays within one line.
pattern = r'(\d+\.\d+\.\d+\.\d+).*?\[([^\]]+)\].*?" (\d{3}) \d+'

# Collect the parsed (ip, timestamp, status) tuples first so they remain
# available after printing.
records = [match.groups() for match in re.finditer(pattern, log)]
for ip, timestamp, status in records:
    # Fixed field widths keep the printed columns aligned.
    print(f'IP: {ip:15} | 时间: {timestamp:25} | 状态: {status}')
# Exercise 4: phone-number masking -- hide the middle four digits.
import re

phone_book = '''
张三138-1234-5678
李四139-5678-1234
王五138-0000-1111
'''

# Groups: 1 = first three digits, 2 = middle four, 3 = last four.
pattern = r'(\d{3})-(\d{4})-(\d{4})'


def mask_phone(match: "re.Match") -> str:
    """Return the matched phone number with its middle group masked.

    Args:
        match: A match object produced by ``pattern``; groups 1 and 3 are
            kept, group 2 is replaced by ``****``.

    Returns:
        The number formatted as ``<first3>-****-<last4>``.
    """
    return f'{match.group(1)}-****-{match.group(3)}'


# re.sub calls mask_phone once per match and splices in its return value.
masked = re.sub(pattern, mask_phone, phone_book)
print(masked)
# Exercise 5: combined challenge -- read a class timetable out of an
# HTML table.
import re

html = '''
<table class="schedule">
<tr><th>时间</th><th>课程</th><th>教室</th></tr>
<tr><td>周一 1-2节</td><td>Python程序设计</td><td>A101</td></tr>
<tr><td>周一 3-4节</td><td>数据结构</td><td>B205</td></tr>
<tr><td>周二 1-2节</td><td>高等数学</td><td>C301</td></tr>
<tr><td>周三 5-6节</td><td>Python程序设计</td><td>A102</td></tr>
</table>
'''

# Three consecutive <td> cells form one row: time slot, course, room.
# The header row uses <th>, so it is skipped.  Optional whitespace is
# allowed between cells so rows split across lines still match.
time_pattern = (
    r'<td>([^<]+)</td>\s*'
    r'<td>([^<]+)</td>\s*'
    r'<td>([^<]+)</td>'
)

# findall returns one (time, course, room) tuple of strings per row.
courses = re.findall(time_pattern, html)
print('课程表:')
for slot, course, room in courses:
    print(f'{slot} | {course} | {room}')