ECharts reports with Django, and how to use Scrapy middlewares

### Using ECharts reports in Django

# urls.py: register the routes
from django.contrib import admin
from django.urls import path

from comic.views import report, report_data

urlpatterns = [
    path('admin/', admin.site.urls),
    path('report/', report),
    path('report_data/', report_data),
]
# views.py: one view renders the page, the other serves the chart data as JSON
from django.shortcuts import render
from django.http import JsonResponse


def report_data(request):
    json_data = {'xdata': ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
                 'ydata': [820, 932, 901, 934, 1290, 1330, 1320]}
    return JsonResponse(json_data)


def report(request):
    context = {'xdata': ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
               'ydata': [820, 932, 901, 934, 1290, 1330, 1320]}
    return render(request, 'report.html', context)
# report.html: the page that displays the ECharts chart
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>ECharts</title>
    <!-- Load echarts.js and jQuery -->
    <script src="/static/js/echarts.min.js"></script>
    <script src="/static/js/jquery.min.js"></script>
</head>
<body>
    <!-- ECharts needs a DOM container with an explicit width and height -->
    <div id="main" style="width: 600px;height:400px;"></div>

    <script type="text/javascript">
        // Initialise the ECharts instance on the prepared DOM node
        var myChart = echarts.init(document.getElementById('main'));

        // Alternative: build the option straight from the template context
        // passed in by the report() view, without AJAX:
        // option = {
        //     xAxis: {type: 'category', data: {{ xdata | safe }}},
        //     yAxis: {type: 'value'},
        //     series: [{data: {{ ydata }}, type: 'bar'}]
        // };
        // myChart.setOption(option);

        // Fetch the chart data from the report_data view via AJAX
        $.get('/report_data/', function (data) {
            option = {
                xAxis: {
                    type: 'category',
                    data: data['xdata']
                },
                yAxis: {
                    type: 'value'
                },
                series: [{
                    data: data['ydata'],
                    type: 'bar'
                }]
            };
            myChart.setOption(option);
        });
    </script>
</body>
</html>
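
The template loads echarts.min.js and jquery.min.js from /static/js/, which only resolves if static files are configured. A minimal sketch of the relevant settings.py entries, assuming the two files are placed in a project-level static/js/ directory (the directory layout is an assumption, not stated in the original post):

# settings.py (sketch): serve files under <project>/static/ at the /static/ URL
import os

STATIC_URL = '/static/'
STATICFILES_DIRS = [
    os.path.join(BASE_DIR, 'static'),  # assumes echarts.min.js / jquery.min.js sit in static/js/
]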

### Crawling JD with Scrapy (using middlewares)

# settings.py
# Uncomment lines 55-57 of the generated settings (the DOWNLOADER_MIDDLEWARES block), as follows:
DOWNLOADER_MIDDLEWARES = {
    'jd.middlewares.SeleniumMiddleware': 543,
}

KEYWORDS = ['月饼']  # search keyword ('月饼' = mooncake)
MAX_PAGE = 110
SELENIUM_TIMEOUT = 10

IPPOOL = [
    {"ipaddr": "115.223.202.210:9000"},
    {"ipaddr": "115.223.252.198:9000"},
    {"ipaddr": "114.234.82.76:9000"},
    {"ipaddr": "115.223.241.43:9000"},
    {"ipaddr": "180.118.92.248:9000"},
    {"ipaddr": "115.223.241.109:9000"},
    {"ipaddr": "27.206.74.114:9000"}
]
FEED_EXPORT_ENCODING = 'utf-8'
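
IPPOOL is declared here, but nothing in the posted code actually uses it: the --proxy-server line in the Selenium middleware below is commented out. As a hedged illustration only (the class name IPPoolMiddleware is an assumption, not part of the original project), a downloader middleware that rotates proxies from the pool for ordinary Scrapy requests could look like this:

# middlewares.py (sketch): rotate proxies from IPPOOL on every request
import random
from jd.settings import IPPOOL


class IPPoolMiddleware(object):
    def process_request(self, request, spider):
        # Pick a random proxy from the pool and attach it to the request;
        # Scrapy's built-in HttpProxyMiddleware then routes the download through it.
        proxy = random.choice(IPPOOL)
        request.meta['proxy'] = 'http://' + proxy['ipaddr']

It would also need its own entry in DOWNLOADER_MIDDLEWARES, and note that requests handled entirely by the Selenium middleware ignore request.meta['proxy'] unless Chrome itself is started with a --proxy-server argument.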

### middlewares.py code


# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from scrapy import signals
import time


class SeleniumMiddleware(object):
    def __init__(self, timeout=None, service_args=None):
        self.timeout = timeout
        chromeOptions = webdriver.ChromeOptions()
        # To route Chrome through a proxy, a --proxy-server argument could be added here, e.g.:
        # chromeOptions.add_argument('--proxy-server=http://host:port')

        self.browser = webdriver.Chrome(options=chromeOptions)
        self.browser.set_window_size(1400, 700)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """
        Fetch the page with Selenium instead of Scrapy's downloader.
        :param request: the Request object
        :param spider: the Spider object
        :return: HtmlResponse
        """
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)

            # Scroll down the page in eighths so JD's lazy-loaded items render
            for i in range(1, 9):
                self.browser.execute_script(
                    'window.scrollTo(0, %d * document.body.scrollHeight / 8)' % i)
                time.sleep(2)

            if page > 1:
                # Jump to the requested page via the page-number input at the bottom
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage .input-txt')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage a.btn-default')))
                input.clear()
                input.send_keys(page)
                submit.click()
                self.wait.until(
                    EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#J_topPage b'), str(page)))

            self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage .input-txt')))
            # Hand the rendered HTML back to Scrapy as a normal response
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request,
                                encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
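
Relying on __del__ to shut the browser down is fragile. A more robust pattern, shown here only as a hedged sketch and not part of the original post, is to connect a spider_closed handler through Scrapy's signals in from_crawler so Selenium is closed when the crawl ends:

    # Sketch: replace __del__ with an explicit spider_closed hook
    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # quit() ends the whole browser session (close() only closes the current window)
        self.browser.quit()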

### items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class JdItem(Item):
    # define the fields for your item here:
    collection = 'products'

    image = Field()
    price = Field()
    deal = Field()
    title = Field()
    shop = Field()
    location = Field()
### The spider under spiders/ that crawls the JD data

# -*- coding: utf-8 -*-
from scrapy import Request, Spider
from urllib.parse import urlencode

from jd.items import JdItem


class DongziSpider(Spider):
    name = 'dongzi'
    allowed_domains = ['search.jd.com']
    # Used as the base search URL; start_requests() builds the real URLs from it
    start_urls = 'https://search.jd.com/Search?'

    def start_requests(self):
        for keyword in self.settings.get('KEYWORDS'):
            data = {'keyword': keyword, 'enc': 'utf-8', 'wq': keyword}

            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                params = urlencode(data)
                url = self.start_urls + params

                print('*' * 20)
                print(url)
                # The page number travels in meta; the Selenium middleware uses it to paginate
                yield Request(url=url, callback=self.parse, meta={'page': page}, dont_filter=True)

    def parse(self, response):
        products = response.xpath(
            '//div[@id="J_goodsList"]//li[@class="gl-item"]/div[contains(@class, "gl-i-wrap")]')

        print('-' * 20)
        print(len(products))

        for product in products:
            item = JdItem()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "p-price")]//i[1]/text()').extract()).strip()
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "p-name")]//text()').extract()).strip()
            # item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            # item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            # item['deal'] = product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract_first()
            # item['location'] = product.xpath('.//div[contains(@class, "location")]//text()').extract_first()
            yield item
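
With FEED_EXPORT_ENCODING already set in settings.py, the crawl can be started and exported in one step, for example with scrapy crawl dongzi -o products.json (the output filename is arbitrary); the yielded items are then written as UTF-8 JSON.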