Scraping JD.com, Xiami Music, and NetEase Cloud Music with Selenium

### Scraping JD.com with Selenium

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from models import Jingdong

# Database session for saving results
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/jd?charset=utf8", max_overflow=5)
session_maker = sessionmaker(bind=engine)
session = session_maker()

# Open a browser
browser = webdriver.Chrome()
# Explicit-wait helper with a 5-second timeout
wait = WebDriverWait(browser, 5)


def get_page(page):
    if page == 1:
        url = 'https://www.jd.com/'
        # Open the home page and run the search once
        browser.get(url)
        search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#key')))
        search_input.clear()
        search_input.send_keys('女士夏装')  # search keyword: women's summer wear
        button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#search button.button')))
        button.click()
    # Give the result page time to load
    time.sleep(5)

    # Scroll to the bottom of the page
    str_js = 'var scrollHeight = document.body.scrollHeight;window.scrollTo(0, scrollHeight);'
    browser.execute_script(str_js)
    time.sleep(2)
    # Alternative: scroll down in 16 steps instead
    # for i in range(16):
    #     i = i + 1
    #     str_js = 'var scrollHeight = document.body.scrollHeight / 16;window.scrollTo(0, scrollHeight * (%d));' % i
    #     browser.execute_script(str_js)
    #     time.sleep(1)
    # Scroll back up in 16 steps so the lazy-loaded items render
    for i in range(16, 0, -1):
        str_js = 'var scrollHeight = document.body.scrollHeight / 16;window.scrollTo(0, scrollHeight * (%d));' % i
        browser.execute_script(str_js)
        time.sleep(4)
    # Keep the current page's HTML
    page_source = browser.page_source

    # Scroll down to the pagination input
    page_input = browser.find_element(By.CSS_SELECTOR, '#J_bottomPage input.input-txt')
    str_js = 'var scrollHeight = document.body.scrollHeight;window.scrollTo(0, %d);' % page_input.location['y']
    browser.execute_script(str_js)
    time.sleep(4)
    # Enter the next page number
    page_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage input.input-txt')))
    page_input.clear()
    page_input.send_keys(str(page + 1))
    # Click the "go to page" button
    submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage .btn.btn-default')))
    submit.click()
    return page_source


def parse_page(html):
    etree_html = etree.HTML(html)
    gl_items = etree_html.xpath('//div[@id="J_goodsList"]//li[@class="gl-item"]')
    print(len(gl_items))
    for gl_item in gl_items:
        img_src = ''.join(gl_item.xpath('.//div[@class="p-img"]/a/img/@src'))
        title = ''.join(gl_item.xpath('.//div[@class="p-name p-name-type-2"]//em//text()'))
        price = ''.join(gl_item.xpath('.//div[@class="p-price"]//strong//text()'))
        shop = ''.join(gl_item.xpath('.//div[@class="p-shop"]//span/a/text()'))
        commit = ''.join(gl_item.xpath('.//div[@class="p-commit"]//strong//text()'))
        icon = ''.join(gl_item.xpath('.//div[@class="p-icons"]/i/text()'))
        jd = Jingdong()
        jd.title = title
        jd.shop = shop
        jd.price = price
        jd.img = img_src
        jd.comment = commit
        jd.icon = icon
        # Save the record; roll back on failure so the session stays usable
        try:
            session.add(jd)
            session.commit()
        except Exception:
            session.rollback()


def main():
    for page in range(100):
        page = page + 1
        print(page)
        html = get_page(page)
        parse_page(html)
    # browser.close()


if __name__ == '__main__':
    main()
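
The `models.py` imported above is not shown in the post. A minimal sketch of what the `Jingdong` model could look like, assuming columns that match the attributes assigned in `parse_page` (the author's actual model may differ):

```python
# models.py -- a guessed reconstruction based on the fields used above
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Jingdong(Base):
    __tablename__ = 'jingdong'

    id = Column(Integer, primary_key=True, autoincrement=True)
    title = Column(String(255))
    shop = Column(String(255))
    price = Column(String(32))
    img = Column(String(512))
    comment = Column(String(64))
    icon = Column(String(255))
```

Running `Base.metadata.create_all(engine)` once after creating the engine would create the table if it does not already exist.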

### The kaisha decoder (kaisha.py)

from urllib import parse


def str2url(s):
    # Example input:
    # s = '9hFaF2FF%_Et%m4F4%538t2i%795E%3pF.265E85.%fnF9742Em33e162_36pA.t6661983%x%6%%74%2i2%22735'
    # The digits before the first 'h' give the row count of the grid
    num_loc = s.find('h')
    rows = int(s[0:num_loc])
    strlen = len(s) - num_loc
    cols = strlen // rows
    right_rows = strlen % rows  # the first right_rows rows hold one extra character
    new_s = list(s[num_loc:])
    output = ''
    for i in range(len(new_s)):
        x = i % rows   # row of the i-th output character
        y = i // rows  # column of the i-th output character
        if x <= right_rows:
            p = x * (cols + 1) + y
        else:
            p = right_rows * (cols + 1) + (x - right_rows) * cols + y
        output += new_s[p]
    # The unscrambled string is still percent-encoded, with '0' masked as '^'
    return parse.unquote(output).replace('^', '0')


def main():
    s = "6hAFxn752E5F215234uy495-3741E8t%mie15F2E185E%6at%72E%7ba%13t21at27261734458%h3%%-5a885d5pF2m%%11799662E13_D55%E992E48%%8i222%4%59358.Fk1EE5-8bc%7632..FF%5%24_%9_mae58%E513e51"
    result_str = str2url(s)
    print(result_str)


main()
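
The scheme is a plain columnar transposition: the percent-encoded URL is written column by column into a grid whose row count is the digit prefix, and the scrambled string is that grid read row by row. `str2url` inverts the read order, URL-decodes, and restores the masked `'0'`s. A hypothetical inverse, `url2str`, makes the layout easy to see and gives a round-trip sanity check; it is an illustration written for this post, not Xiami's code, and it assumes the URL contains no literal `'^'`:

```python
from urllib.parse import quote


def url2str(url, rows=6):
    # Hypothetical encoder for testing str2url (not Xiami's actual code).
    # Mask the zeros, percent-encode, then write the text column by column
    # into `rows` rows and read it back row by row.
    s = quote(url.replace('0', '^'))
    grid = [''] * rows
    for i, ch in enumerate(s):
        grid[i % rows] += ch  # round-robin over rows == column-wise fill
    return str(rows) + ''.join(grid)


# Round trip: decoding the encoded URL should give the original back
u = 'http://example.com/song/12345.mp3'
assert str2url(url2str(u)) == u
```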

### Scraping Xiami Music

import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import kaisha

# Open a browser
browser = webdriver.Chrome()
# Explicit-wait helper with a 5-second timeout
wait = WebDriverWait(browser, 5)


def get_resourse(url):
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content
    return None


def get_page():
    url = 'https://www.xiami.com/'
    # Open the site
    browser.get(url)
    # Click "back to the old version" (method 1: locate by link text)
    button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, '回旧版')))
    button.click()
    # Click the charts entry (method 2: locate by XPath)
    button = wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@id = "secondary"]//div[@class="nav"]/a[2]')))
    button.click()
    # Click the Western chart tab
    button = wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@id = "tab"]/span[3]/a')))
    button.click()
    # Instead of sleeping, one could wait for a specific element, e.g.:
    # wait.until(EC.text_to_be_present_in_element((By.XPATH, '//tr[@data-index="99"]/td[@class="trackid"]'), '100'))
    time.sleep(3)
    # Grab the rendered page
    return browser.page_source


def parse_page(html):
    etree_html = etree.HTML(html)

    song_list = etree_html.xpath('//tr[@class="songwrapper"]')
    for song in song_list:
        # The track URL is scrambled inside the data-mp3 attribute
        mp3_data = song.xpath('./@data-mp3')[0]
        # Decode it with the kaisha module
        mp3_url = kaisha.str2url(mp3_data)
        # Track title
        mp3_title = song.xpath('./@data-title')[0]
        mp3_title = mp3_title.replace(' ', '')
        print(mp3_title)
        print(mp3_url)
        # save_mp3(mp3_url, mp3_title)
        save_us(mp3_url, mp3_title)


def save_mp3(mp3_url, mp3_title):
    content = get_resourse(mp3_url)
    with open('./mp3/%s.mp3' % mp3_title, 'wb') as f:
        f.write(content)


# Save English-language tracks to a separate folder
def save_us(mp3_url, mp3_title):
    content = get_resourse(mp3_url)
    with open('./us/%s.mp3' % mp3_title, 'wb') as f:
        f.write(content)


def main():
    html = get_page()
    parse_page(html)


if __name__ == '__main__':
    main()
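
Two things the save helpers above gloss over: the `./mp3` and `./us` folders must already exist, and a `data-title` can contain characters that are illegal in file names (such as `/`). A minimal, safer variant using only the standard library; `safe_name` and `save_track` are hypothetical helpers sketched for this post, not part of the original script:

```python
import os
import re


def safe_name(title):
    # Replace characters that are illegal or awkward in file names
    return re.sub(r'[\\/:*?"<>|]', '_', title)


def save_track(mp3_url, mp3_title, folder='./us'):
    content = get_resourse(mp3_url)
    if content is None:
        return
    os.makedirs(folder, exist_ok=True)  # create the folder on first use
    with open(os.path.join(folder, safe_name(mp3_title) + '.mp3'), 'wb') as f:
        f.write(content)
```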

### Scraping NetEase Cloud Music charts

# coding=utf-8
'''
Scrape the NetEase Cloud Music charts
'''
# Modules used below
import os
import csv
import time
import random
import requests
import threading
from lxml import etree
from selenium import webdriver

agents = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
]
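
As written, `agents` is only referenced by commented-out code inside `write_music` below. If you want to rotate the User-Agent per request, the intended usage (mirroring that commented-out code) is simply:

```python
# Inside write_music: pick a random User-Agent for this request
headers = {'User-Agent': random.choice(agents)}
music = requests.get(url, headers=headers)
```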


class WangyiMusc(object):

    # id is the chart you want to fetch
    # mating is the chart's name
    # url is the chart page
    # music_url is the track download endpoint
    def __init__(self, mating, id):
        self.browser = webdriver.Chrome()
        self.id = id
        self.url = "https://music.163.com/#/discover/toplist?id={0}".format(self.id)
        self.music_url = "http://music.163.com/song/media/outer/url?id={0}.mp3"
        self.mat = mating

    # The track list lives inside an iframe, so switch Selenium into the
    # child frame and return its source
    def get_html(self):
        self.browser.get(self.url)
        self.browser.switch_to.frame('contentFrame')
        # Give the frame time to render before grabbing its source
        time.sleep(5)
        iframe = self.browser.page_source
        return iframe

    # Extract the track info from the page with XPath
    def parse_html(self):
        iframe = self.get_html()
        html = etree.HTML(iframe)
        contents = html.xpath('//tbody/tr')
        try:
            for content in contents:
                name = content.xpath('./td/div/div/div/span/a/b/@title')[0].replace(' ', '')
                music_id = content.xpath('./td/div/div/span/@data-res-id')[0].strip()
                print(name)
                print(music_id)
                # Hand the track name and id to write_music for download
                self.write_music(name, music_id)
                # num = content.xpath('./td/div/span[@class="num"]/text()')[0].strip()
                # date = content.xpath('./td[@class=" s-fc3"]/span/text()')[0].strip()
                # singer = content.xpath('./td/div[@class="text"]/@title')[0].strip()
                # # Pack the track info into a tuple
                # items = num, name, date, singer
                #
                # # Yield one track's info at a time
                # yield list(items)

        except Exception as e:
            print('Parsing failed!', e.args)

    # Current date, used for folder names
    def get_date(self):
        t = time.localtime()
        tt = time.strftime('%Y年%m月%d日', t)
        return tt

    # Download a track given its name and id
    def write_music(self, name, music_id):
        url = self.music_url.format(music_id)
        # agent = random.choice(agents)
        try:
            # headers = {
            #     'User-Agent': agent
            # }
            # music = requests.get(url, headers=headers)
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
            }
            music = requests.get(url, headers=headers)
            if music.status_code == 200:
                # Alternatively, save under today's date and the chart name:
                # path = self.get_date() + os.sep + self.mat + os.sep
                # if not os.path.exists(path):
                #     os.makedirs(path)
                # Make sure the output folder exists before writing
                os.makedirs('./wangyiyun', exist_ok=True)
                with open('./wangyiyun/' + name + '.mp3', 'wb') as f:
                    f.write(music.content)
                print('Downloaded ' + name + ' successfully...')

        except Exception as e:
            print("Failed to download {0}!".format(name), e.args)

    # Save the chart metadata to CSV
    # (note: this expects parse_html to yield rows, so re-enable the
    # commented-out yield block above before using it)
    def write_items(self):
        # Store under today's date so runs are easy to find
        path = self.get_date() + os.sep + self.mat
        try:
            # Create the folder if it does not exist yet
            if not os.path.exists(path):
                os.makedirs(path)
            with open(path + '榜单信息.csv', 'w', encoding='utf-8') as file:
                # Save in CSV format
                csvfile = csv.writer(file)
                csvfile.writerow(['排名', '歌名', '时长', '歌手'])
                for parse in self.parse_html():
                    csvfile.writerow(parse)
                print("Saved chart info.")
        except Exception as e:
            print("Failed to save chart info!", e.args)


def operation(mating, id):
    try:
        # Instantiate the scraper
        music = WangyiMusc(mating, id)
        # Parse the chart page (parse_html fetches it via get_html)
        # and download every track on it
        music.parse_html()
        # Get the current date
        # music.get_date()
        # Save the chart metadata
        # music.write_items()
        # print("All downloads finished...")
        # Close the browser
        music.browser.close()
    except Exception as e:
        print("Download failed!", e.args)


# Interactive chart picker
def user_select():
    # The id of each chart
    ids = ['19723756', '3779629', '2884035', '3778678', '991319590',
           '2408901803', '1978921795', '71385702', '2462790889',
           '10520166', '3812895', '60131', '71384707', '180106', '60198',
           '27135204', '11641012', '120001', '2323534945', '745956260',
           '2023401535', '2006508653', '21845217', '112463',
           '112504', '64016', '10169002', '1899724']

    # The name of each chart (note: this list is longer than ids and
    # contains duplicate names, so some menu entries collapse onto the
    # same id when the dict below is built)
    names = ['云音乐飙升榜', '云音乐新歌榜', '网易原创歌曲榜',
             '云音乐热歌榜', '江小白YOLO云音乐说唱榜', '公告牌音乐榜', '云音乐电音榜',
             '云音乐电音榜', '云音乐ACG音乐榜', 'YY音乐榜', '云音乐国电榜', '云音乐国电榜',
             '云音乐国电榜', '云音乐古典音乐榜', 'UK排行榜周榜', '美国Billboard周榜',
             '法国 NRJVos Hits 周榜', 'iTunes榜', 'Hit FMTop榜', '说唱TOP榜', '云音乐韩语榜',
             '英国Q杂志中文版周榜', '电竞音乐榜', 'KTV唛榜', '台湾Hito排行榜', '中国TOP排行榜(港台榜)',
             '中国TOP排行榜(内地榜)', '香港r台中文歌曲龙虎榜', '中国嘻哈榜']
    musics = {}
    nums = {}

    # Map each chart name to its id
    for mat in range(len(ids)):
        musics[names[mat]] = ids[mat]

    # Give every chart a menu number
    for num in range(1, len(names) + 1):
        nums[num] = names[num - 1]

    # Print the numbered menu
    for k, v in nums.items():
        print(k, ":", v)

    # Pack the (name, id) pairs into a list
    music_list = list(musics.items())

    # Download four charts at a time, so step through the list in blocks of four
    list1 = [i for i in range(len(music_list)) if i % 4 == 0]

    # Ask which chart to download
    n = int(input('Enter the number of the chart to download (0 downloads them all): '))

    # Download everything, four threads per batch
    if n == 0:
        for t in list1:
            # Slice so a final batch shorter than four does not crash
            batch = music_list[t:t + 4]
            threads = [threading.Thread(target=operation, args=item) for item in batch]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
    else:
        # Otherwise download just the selected chart
        id = musics[nums[n]]
        mating = nums[n]

        operation(mating, id)
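
For reference, the same four-at-a-time fan-out can be written with `concurrent.futures`, which queues the work without any index bookkeeping; a minimal sketch, assuming `operation` and a `music_list` of `(name, id)` pairs as above:

```python
from concurrent.futures import ThreadPoolExecutor


def download_all(music_list):
    # At most four charts download concurrently; submit() queues the rest
    with ThreadPoolExecutor(max_workers=4) as pool:
        for mating, chart_id in music_list:
            pool.submit(operation, mating, chart_id)
```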


if __name__ == '__main__':
    # Program entry point
    user_select()