Using proxy IPs

### 蘑菇代理 (Mogu proxy)

The helper below requests a proxy IP from the 蘑菇代理 API and builds a `proxies` dict that `requests` can use:

```python
import requests


def get_page():
    url = "蘑菇代理的api"  # placeholder: the 蘑菇代理 API URL for fetching proxies
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    # print(response.status_code)
    if response.status_code == 200:
        return response.json()


def get_proxies():
    ip_json = get_page()
    print(ip_json)
    ip = ip_json['msg'][0]['ip']
    port = ip_json['msg'][0]['port']
    # the port may come back as an int, so cast it before concatenating
    proxy = ip + ':' + str(port)

    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy
    }

    return proxies


def main():
    print(get_proxies())


if __name__ == '__main__':
    main()
```
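
It can help to confirm that a fetched proxy actually works before handing it to the crawler. The sketch below is one way to do that; the test URL (httpbin.org/ip) and the `check_proxy` helper are my own additions, not part of the original code.

```python
import requests


def check_proxy(proxies, timeout=5):
    """Return True if the proxy answers a simple request (hypothetical helper)."""
    try:
        # httpbin echoes back the IP it sees, so a 200 here means the proxy is usable
        resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False


# Example: keep asking 蘑菇代理 for proxies until a working one turns up
# proxies = get_proxies()
# while not check_proxy(proxies):
#     proxies = get_proxies()
```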

### Database

A small pymysql helper module that opens a connection to a local `douban` database and inserts scraped records into the `douban_tb` table:

```python
import pymysql


# Open a database connection
def get_db_con():
    host = '127.0.0.1'
    port = 3306
    user = 'root'
    password = '123456'
    database = 'douban'
    con = pymysql.connect(host=host, user=user, password=password,
                          database=database, charset='utf8', port=port)
    return con


# Get a cursor from the connection
def get_cursor(con):
    return con.cursor()


# Close the connection
def close_con(con):
    con.close()


# Run the insert statement (parameterized, so quotes in the scraped text don't break it)
def insert_douban(one_movie_dict, con, cursor):
    sql = ("insert into douban_tb (title, content, href, source, d_time, d_like) "
           "values (%s, %s, %s, %s, %s, %s)")
    params = (one_movie_dict['title'], one_movie_dict['content'], one_movie_dict['href'],
              one_movie_dict['source'], one_movie_dict['d_time'], one_movie_dict['d_like'])
    print(sql)
    cursor.execute(sql, params)
    con.commit()


def main():
    con = get_db_con()
    cursor = get_cursor(con)
    one_movie_dict = {}
    one_movie_dict['title'] = '我有一门看上去高大上,实际没有什么用的技能'
    one_movie_dict['content'] = '时隔多年,终于更新了。今天给各位鹅带来羽绒服的豆腐教学,完成后的效果图如下: 想看其它教学的,移步楼主豆列: 废话不多说,直接进入正题: 第一步,将羽绒服平铺捋平,然后从底部取..'
    one_movie_dict['href'] = 'https://www.douban.com/group/topic/130929608/'
    one_movie_dict['source'] = '生活组小组'
    one_movie_dict['d_time'] = '2019-1-8'
    one_movie_dict['d_like'] = '13'
    insert_douban(one_movie_dict, con, cursor)
    close_con(con)


if __name__ == '__main__':
    main()
```
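
The insert above assumes a `douban_tb` table already exists. The original post doesn't show its schema, so the following is only a plausible sketch based on the columns the code writes; the column types and sizes are assumptions.

```python
import pymysql

# Hypothetical schema matching the six columns used by insert_douban
CREATE_SQL = """
CREATE TABLE IF NOT EXISTS douban_tb (
    id      INT AUTO_INCREMENT PRIMARY KEY,
    title   VARCHAR(255),
    content TEXT,
    href    VARCHAR(512),
    source  VARCHAR(128),
    d_time  VARCHAR(32),
    d_like  VARCHAR(32)
) DEFAULT CHARSET=utf8mb4
"""

con = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                      password='123456', database='douban', charset='utf8')
with con.cursor() as cursor:
    cursor.execute(CREATE_SQL)
con.commit()
con.close()
```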

### Scraping Douban through a proxy (to get around IP bans)

The crawler below pages through the Douban group explore list. It starts without a proxy and only switches to one from 蘑菇代理 once a request fails or comes back blocked:

```python
import requests
import proxy_helper
import time
import db_helper
from lxml import etree

con = db_helper.get_db_con()
cursor = db_helper.get_cursor(con)


def get_page(page, proxies):
    url = "https://www.douban.com/group/explore?start=%d" % page
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }

    # Only go through the proxy once one has been provided
    if proxies is None:
        response = requests.get(url, headers=headers, timeout=10)
    else:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)

    print(response.status_code)
    if response.status_code == 200:
        return response.text
    return None


def parse_page(html):
    result_list = []
    etree_html = etree.HTML(html)
    items = etree_html.xpath('//div[@class="channel-item"]')
    try:
        for item in items:
            result_dict = {}
            # title
            title = item.xpath('./div[@class="bd"]/h3/a/text()')[0]
            # summary
            p = item.xpath('./div[@class="bd"]/div/p/text()')[0]
            # source
            source = item.xpath('./div[@class="bd"]/div/span[@class="from"]/a/text()')[0]
            # publish time
            pubtime = item.xpath('./div[@class="bd"]/div/span[@class="pubtime"]/text()')[0]
            # number of likes
            like = item.xpath('./div[@class="likes"]/text()')[0]
            # print(like)
            # detail page link
            href = item.xpath('./div[@class="bd"]/h3/a/@href')[0]
            result_dict['title'] = title
            result_dict['href'] = href
            result_dict['content'] = p
            result_dict['source'] = source
            result_dict['d_time'] = pubtime
            result_dict['d_like'] = like
            db_helper.insert_douban(result_dict, con, cursor)
            # print(result_dict)
            result_list.append(result_dict)
        print(result_list)
    except Exception:
        # skip a page whose layout doesn't match the xpath above
        pass
    return result_list


if __name__ == '__main__':
    proxies = None
    page = 0
    while True:
        print('**********%d******' % (page + 1))
        if page >= 307:
            break
        try:
            html = get_page(page * 30, proxies)
        except Exception:
            html = None

        if html is None:
            # Probably blocked: wait, fetch a fresh proxy, and retry this page
            time.sleep(2)
            proxies = proxy_helper.get_proxies()
            continue

        # print(html)
        parse_page(html)
        page += 1

    db_helper.close_con(con)
    print('Finished successfully....')
```
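
After a run it is worth checking what actually landed in MySQL. The quick verification sketch below reuses the db_helper module from earlier; the SELECT statements are my own addition, not part of the original post.

```python
import db_helper

con = db_helper.get_db_con()
cursor = db_helper.get_cursor(con)

# Total number of rows inserted so far
cursor.execute("SELECT COUNT(*) FROM douban_tb")
print(cursor.fetchone()[0])

# A few sample rows to eyeball the scraped fields
cursor.execute("SELECT title, source, d_time, d_like FROM douban_tb LIMIT 5")
for row in cursor.fetchall():
    print(row)

db_helper.close_con(con)
```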