### 代码主页代码如下
import json
import re
import requests
from day01_maoyan import db_helper
# Create the database connection and cursor at import time; parse_page passes
# both to db_helper.insert_movie and main closes the connection.
db_con = db_helper.get_db_con()
cursor = db_helper.get_cursor(db_con)
def save_json(result_list):
    """Serialize ``result_list`` to JSON and write it to ``maoyan.json``.

    Non-ASCII characters are written unescaped (``ensure_ascii=False``) and
    the file is encoded as UTF-8. The file is opened in ``'w'`` mode, so any
    existing content is replaced.
    """
    payload = json.dumps(result_list, ensure_ascii=False)
    with open('maoyan.json', mode='w', encoding='utf-8') as out_file:
        out_file.write(payload)
# Data cleaning
def strips(l):
    """Return a new list holding each string of ``l`` with surrounding whitespace removed."""
    return [text.strip() for text in l]
# Fetch a single web page and return response.text (a string)
def get_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` may block indefinitely on an
            unresponsive server.

    Returns:
        ``response.text`` when the status code is 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: on connection failures or when
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
# Fetch an image and return response.content (raw bytes)
def get_pic(url, timeout=10):
    """Download ``url`` and return the response body as bytes.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` may block indefinitely on an
            unresponsive server.

    Returns:
        ``response.content`` when the status code is 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: on connection failures or when
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.content
# Extract the score
def get_score(score_html):
    """Find the integer/fraction score pairs inside ``score_html``.

    Returns a list of ``(integer_part, fraction_part)`` string tuples, one
    per match; the list is empty when nothing matches.
    """
    score_re = re.compile('<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>')
    return score_re.findall(score_html)
# Parse one board page
def parse_page(html):
    """Extract the movie entries from one board page.

    Title, cast line, release-time line, score, rank, poster URL and detail
    link are pulled out of ``html`` with regular expressions. Every poster
    URL is passed to ``save_pic`` and every assembled movie is inserted via
    ``db_helper.insert_movie``.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example when ``get_page`` returned ``None`` for a non-200
            response) yields an empty result instead of a ``TypeError``.

    Returns:
        A list of dicts with the keys ``title``, ``actor``, ``time``,
        ``score``, ``rank``, ``img`` and ``href``, in page order.

    Raises:
        IndexError: if some field matched fewer times than the title did.
    """
    # A failed download arrives as None; re.findall cannot search None.
    if not html:
        return []

    result_list = []

    # Cast
    pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    actors = strips(re.findall(pattern, html))

    # Movie titles
    pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
    films = strips(re.findall(pattern, html))

    # Release time (named release_times to avoid shadowing the stdlib
    # module name ``time``)
    pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
    release_times = strips(re.findall(pattern, html))

    # Scores: each score paragraph holds an integer part and a fraction part
    pattern = re.compile('<p class="score">(.*?)</p>', re.S)
    score_ps = re.findall(pattern, html)
    score_list = []
    for score_html in score_ps:
        scores = get_score(score_html)
        # Keep one entry per score paragraph even when it has no
        # integer/fraction pair, so the lists stay aligned by index.
        score_list.append(''.join(scores[0]) if scores else '')

    # Rank
    pattern = re.compile('<i class="board-index board-index-.*?">(.*?)</i>', re.S)
    board = re.findall(pattern, html)

    # Link to the movie detail page
    pattern = re.compile('<dd>.*?<a href="(.*?)" title=".*?" .*?>.*?</a>', re.S)
    href = re.findall(pattern, html)

    # Poster image URLs; download every poster before assembling the items
    pattern = re.compile('<img data-src="(.*?)" .*?>', re.S)
    img = re.findall(pattern, html)
    for img_url in img:
        save_pic(img_url)

    for i, title in enumerate(films):
        item = {
            'title': title,
            'actor': actors[i],
            'time': release_times[i],
            'score': score_list[i],
            'rank': board[i],
            'img': img[i],
            'href': href[i],
        }
        # Insert into the database
        db_helper.insert_movie(item, db_con, cursor)
        result_list.append(item)
    return result_list
# Fetch every board page
def get_all_page():
    """Download and parse the ten board pages (offsets 0, 10, ..., 90).

    Returns the entries of all pages concatenated into one list, in page
    order.
    """
    collected = []
    for offset in range(0, 100, 10):
        page_url = 'http://maoyan.com/board/4?offset=%d' % offset
        # Fetch the page, then parse it
        collected += parse_page(get_page(page_url))
    return collected
# Save an image from response.content (raw bytes)
def save_pic(url):
    """Download the image at ``url`` and store it under ``./images``.

    The file name is the last ``/``-separated segment of ``url`` with
    everything from the first ``@`` onward removed. The ``./images``
    directory is created when it does not exist yet.

    Args:
        url: Address of the image.

    Returns:
        None. Nothing is written when ``get_pic`` returns ``None`` (non-200
        response), so no empty file is left behind.
    """
    import os  # local import: this block cannot rely on a file-level one

    img_content = get_pic(url)
    if img_content is None:
        return
    file_name = url.split('/')[-1].split('@')[0]
    print(file_name)
    os.makedirs('./images', exist_ok=True)
    with open('./images/%s' % file_name, 'wb') as f:
        f.write(img_content)
def main():
    """Scrape all board pages, print the entry count and dump them to JSON.

    The module-level database connection is closed when the run ends, even
    if scraping or saving raises.
    """
    try:
        result_list = get_all_page()
        print(len(result_list))
        save_json(result_list)
    finally:
        db_helper.close_con(db_con)


if __name__ == '__main__':
    main()
### 数据库连接,及其保存(db_helper.py)
import pymysql
# Open the database connection
def get_db_con():
    """Open and return a PyMySQL connection to the local ``test2`` database.

    The settings are passed as keyword arguments: newer PyMySQL releases no
    longer accept them positionally, and keywords are also understood by
    the releases that did.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    host = '127.0.0.1'
    port = 3306
    user = 'root'
    password = '123456'
    database = 'test2'
    con = pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=database,
        charset='utf8',
        port=port,
    )
    return con
# Obtain a database cursor
def get_cursor(con):
    """Create a cursor on the connection ``con`` and return it."""
    new_cursor = con.cursor()
    return new_cursor
# Close the connection
def close_con(con):
    """Close the connection ``con``; returns ``None``."""
    con.close()
    return None
# Run the insert statement
def insert_movie(one_movie_dict, con, cursor):
    """Insert one movie into the ``maoyan_movie`` table and commit.

    The values are handed to ``cursor.execute`` as parameters instead of
    being interpolated into the SQL text, so quotes in scraped titles or
    cast lines neither break the statement nor allow SQL injection.

    Args:
        one_movie_dict: Mapping with the keys ``title``, ``actor`` and
            ``time`` (stored in the ``release_time`` column).
        con: Open database connection; used for commit/rollback.
        cursor: Cursor on ``con`` used to execute the statement.

    Raises:
        KeyError: if a required key is missing from ``one_movie_dict``.
        Exception: whatever the driver raises; the transaction is rolled
            back before the error is re-raised.
    """
    sql = "insert into maoyan_movie (title, actor, release_time) values (%s, %s, %s)"
    params = (
        one_movie_dict['title'],
        one_movie_dict['actor'],
        one_movie_dict['time'],
    )
    print(sql)
    try:
        cursor.execute(sql, params)
        con.commit()
    except Exception:
        # Do not leave a half-finished transaction open on the connection.
        con.rollback()
        raise
def main():
    """Insert one sample movie row as a manual check of this module.

    The connection is closed even when the insert fails.
    """
    con = get_db_con()
    try:
        cursor = get_cursor(con)
        # insert_movie reads the release date from the 'time' key; the
        # former 'release_time' key made it raise KeyError.
        one_movie_dict = {
            'title': '霸王别姬',
            'actor': '巩俐',
            'time': '2019-01-03',
        }
        insert_movie(one_movie_dict, con, cursor)
    finally:
        close_con(con)


if __name__ == '__main__':
    main()