ECharts reports with Django, and how to use Scrapy middlewares

### Using ECharts reports in Django

# urls.py: register the routes
from django.contrib import admin
from django.urls import path

from comic.views import report, report_data

urlpatterns = [
    path('admin/', admin.site.urls),
    path('report/', report),
    path('report_data/', report_data),
]
# views.py: one view renders the page, the other serves the chart data as JSON
from django.shortcuts import render
from django.http import JsonResponse


def report_data(request):
    json_data = {'xdata': ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
                 'ydata': [820, 932, 901, 934, 1290, 1330, 1320]}
    return JsonResponse(json_data)


def report(request):
    context = {'xdata': ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
               'ydata': [820, 932, 901, 934, 1290, 1330, 1320]}
    return render(request, 'report.html', context)
# report.html: the page that displays the ECharts chart
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>ECharts</title>
    <!-- Load echarts.js and jQuery -->
    <script src="/static/js/echarts.min.js"></script>
    <script src="/static/js/jquery.min.js"></script>
</head>
<body>
    <!-- ECharts needs a DOM container with an explicit width and height -->
    <div id="main" style="width: 600px;height:400px;"></div>

    <script type="text/javascript">
        // Initialise the ECharts instance on the prepared DOM node
        var myChart = echarts.init(document.getElementById('main'));

        // Alternative: build the option straight from the template context
        // passed in by the report() view, without AJAX:
        // option = {
        //     xAxis: {type: 'category', data: {{ xdata | safe }}},
        //     yAxis: {type: 'value'},
        //     series: [{data: {{ ydata }}, type: 'bar'}]
        // };
        // myChart.setOption(option);

        // Fetch the chart data from the report_data view via AJAX
        $.get('/report_data/', function (data) {
            option = {
                xAxis: {
                    type: 'category',
                    data: data['xdata']
                },
                yAxis: {
                    type: 'value'
                },
                series: [{
                    data: data['ydata'],
                    type: 'bar'
                }]
            };
            myChart.setOption(option);
        });
    </script>
</body>
</html>
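
The template loads echarts.min.js and jquery.min.js from /static/js/, which only resolves if static files are configured. A minimal sketch of the relevant settings.py entries, assuming the two files are placed in a project-level static/js/ directory (the directory layout is an assumption, not stated in the original post):

# settings.py (sketch): serve files under <project>/static/ at the /static/ URL
import os

STATIC_URL = '/static/'
STATICFILES_DIRS = [
    os.path.join(BASE_DIR, 'static'),  # assumes echarts.min.js / jquery.min.js sit in static/js/
]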

### Crawling JD with Scrapy (using middlewares)

# settings.py
# Uncomment lines 55-57 of the generated settings (the DOWNLOADER_MIDDLEWARES block), as follows:
DOWNLOADER_MIDDLEWARES = {
    'jd.middlewares.SeleniumMiddleware': 543,
}

KEYWORDS = ['月饼']  # search keyword ('月饼' = mooncake)
MAX_PAGE = 110
SELENIUM_TIMEOUT = 10

IPPOOL = [
    {"ipaddr": "115.223.202.210:9000"},
    {"ipaddr": "115.223.252.198:9000"},
    {"ipaddr": "114.234.82.76:9000"},
    {"ipaddr": "115.223.241.43:9000"},
    {"ipaddr": "180.118.92.248:9000"},
    {"ipaddr": "115.223.241.109:9000"},
    {"ipaddr": "27.206.74.114:9000"}
]
FEED_EXPORT_ENCODING = 'utf-8'
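
IPPOOL is declared here, but nothing in the posted code actually uses it: the --proxy-server line in the Selenium middleware below is commented out. As a hedged illustration only (the class name IPPoolMiddleware is an assumption, not part of the original project), a downloader middleware that rotates proxies from the pool for ordinary Scrapy requests could look like this:

# middlewares.py (sketch): rotate proxies from IPPOOL on every request
import random
from jd.settings import IPPOOL


class IPPoolMiddleware(object):
    def process_request(self, request, spider):
        # Pick a random proxy from the pool and attach it to the request;
        # Scrapy's built-in HttpProxyMiddleware then routes the download through it.
        proxy = random.choice(IPPOOL)
        request.meta['proxy'] = 'http://' + proxy['ipaddr']

It would also need its own entry in DOWNLOADER_MIDDLEWARES, and note that requests handled entirely by the Selenium middleware ignore request.meta['proxy'] unless Chrome itself is started with a --proxy-server argument.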

### middlewares.py code


# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from scrapy import signals
import time


class SeleniumMiddleware(object):
    def __init__(self, timeout=None, service_args=None):
        self.timeout = timeout
        chromeOptions = webdriver.ChromeOptions()
        # To route Chrome through a proxy, a --proxy-server argument could be added here, e.g.:
        # chromeOptions.add_argument('--proxy-server=http://host:port')

        self.browser = webdriver.Chrome(options=chromeOptions)
        self.browser.set_window_size(1400, 700)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """
        Fetch the page with Selenium instead of Scrapy's downloader.
        :param request: the Request object
        :param spider: the Spider object
        :return: HtmlResponse
        """
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)

            # Scroll down the page in eighths so JD's lazy-loaded items render
            for i in range(1, 9):
                self.browser.execute_script(
                    'window.scrollTo(0, %d * document.body.scrollHeight / 8)' % i)
                time.sleep(2)

            if page > 1:
                # Jump to the requested page via the page-number input at the bottom
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage .input-txt')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage a.btn-default')))
                input.clear()
                input.send_keys(page)
                submit.click()
                self.wait.until(
                    EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#J_topPage b'), str(page)))

            self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage .input-txt')))
            # Hand the rendered HTML back to Scrapy as a normal response
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request,
                                encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
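
Relying on __del__ to shut the browser down is fragile. A more robust pattern, shown here only as a hedged sketch and not part of the original post, is to connect a spider_closed handler through Scrapy's signals in from_crawler so Selenium is closed when the crawl ends:

    # Sketch: replace __del__ with an explicit spider_closed hook
    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # quit() ends the whole browser session (close() only closes the current window)
        self.browser.quit()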

### items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class JdItem(Item):
    # define the fields for your item here:
    collection = 'products'

    image = Field()
    price = Field()
    deal = Field()
    title = Field()
    shop = Field()
    location = Field()
### The spider under spiders/ that crawls the JD data

# -*- coding: utf-8 -*-
from scrapy import Request, Spider
from urllib.parse import urlencode

from jd.items import JdItem


class DongziSpider(Spider):
    name = 'dongzi'
    allowed_domains = ['search.jd.com']
    # Used as the base search URL; start_requests() builds the real URLs from it
    start_urls = 'https://search.jd.com/Search?'

    def start_requests(self):
        for keyword in self.settings.get('KEYWORDS'):
            data = {'keyword': keyword, 'enc': 'utf-8', 'wq': keyword}

            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                params = urlencode(data)
                url = self.start_urls + params

                print('*' * 20)
                print(url)
                # The page number travels in meta; the Selenium middleware uses it to paginate
                yield Request(url=url, callback=self.parse, meta={'page': page}, dont_filter=True)

    def parse(self, response):
        products = response.xpath(
            '//div[@id="J_goodsList"]//li[@class="gl-item"]/div[contains(@class, "gl-i-wrap")]')

        print('-' * 20)
        print(len(products))

        for product in products:
            item = JdItem()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "p-price")]//i[1]/text()').extract()).strip()
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "p-name")]//text()').extract()).strip()
            # item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            # item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            # item['deal'] = product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract_first()
            # item['location'] = product.xpath('.//div[contains(@class, "location")]//text()').extract_first()
            yield item
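
With FEED_EXPORT_ENCODING already set in settings.py, the crawl can be started and exported in one step, for example with scrapy crawl dongzi -o products.json (the output filename is arbitrary); the yielded items are then written as UTF-8 JSON.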