### pipelines.py: writing to MySQL, saving downloaded images, MongoDB, and mongoengine
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from mongoengine import connect
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import pymongo
from youyaoqi.models import Comic
class YouyaoqiMysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.username = user
        self.password = password
        self.port = port
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get("MYSQL_HOST"),
            database=crawler.settings.get("MYSQL_DATABASE"),
            user=crawler.settings.get("MYSQL_USERNAME"),
            password=crawler.settings.get("MYSQL_PASSWORD"),
            port=crawler.settings.get("MYSQL_PORT"),
        )
    def open_spider(self, spider):
        # open the database connection
        self.db = pymysql.connect(host=self.host, user=self.username, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()
    def close_spider(self, spider):
        # release the database connection
        self.db.close()
    def process_item(self, item, spider):
        # parameterized insert: placeholders are bare %s, not quoted strings
        sql = 'insert into sort(comic_id, name, cover, line1, line2, update_type) values (%s, %s, %s, %s, %s, %s)'
        self.cursor.execute(sql, (item['comic_id'], item['name'], item['cover'], item['line1'], item['line2'], item['update_type']))
        self.db.commit()
        return item
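The pipeline above assumes the `sort` table already exists in the `youyaoqi` database. A minimal one-off setup sketch with pymysql; the column names follow the insert statement above, but the types and lengths are assumptions:

```python
import pymysql

# Hypothetical DDL for the `sort` table the pipeline writes to;
# column types/lengths are assumptions, adjust to your data.
conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       database='youyaoqi', charset='utf8', port=3306)
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS sort (
            id INT AUTO_INCREMENT PRIMARY KEY,
            comic_id VARCHAR(32),
            name VARCHAR(255),
            cover VARCHAR(512),
            line1 VARCHAR(255),
            line2 VARCHAR(255),
            update_type VARCHAR(64)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()
```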
# Image download pipeline
class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # name the saved file after the last path segment of its URL
        url = request.url
        file_name = url.split('/')[-1]
        return file_name
    def item_completed(self, results, item, info):
        # drop items whose cover image failed to download
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item
    def get_media_requests(self, item, info):
        # schedule the cover image URL for download
        yield Request(item['cover'])
# MongoDB pipeline
class YouyaoqiMongoPipeline(object):
    def __init__(self, uri, database):
        self.uri = uri
        self.database = database
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            uri=crawler.settings.get('MONGO_URI'),
            database=crawler.settings.get('MONGO_DB')
            )
    def open_spider(self, spider):
        # open the database connection
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.database]
    def close_spider(self, spider):
        # release the database connection
        self.client.close()
    def process_item(self, item, spider):
        # insert the item; insert_one replaces pymongo's deprecated insert()
        collection_name = item.collection
        self.db[collection_name].insert_one(dict(item))
        return item
# mongoengine
class YouyaoqiMongoenginePipeline(object):
    def __init__(self, database):
        self.database = database
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            database=crawler.settings.get('MONGO_DB')
            )
    def open_spider(self, spider):
        # open the database connection
        self.con = connect(self.database)
    def close_spider(self, spider):
        # release the database connection
        self.con.close()
    def process_item(self, item, spider):
        # persist the item through the mongoengine model
        if item.collection == 'manhua':
            comic = Comic()
            comic.comic_id = item['comic_id']
            comic.name = item['name']
            comic.cover = item['cover']
            comic.line1 = item['line1']
            comic.line2 = item['line2']
            comic.update_type = item['update_type']
            comic.save()
        return item
### settings.py
# At line 67, set ITEM_PIPELINES as follows:
ITEM_PIPELINES = {
   # 'youyaoqi.pipelines.YouyaoqiMysqlPipeline': 300,
   'youyaoqi.pipelines.YouyaoqiMongoPipeline': 310,
   # 'youyaoqi.pipelines.YouyaoqiMongoenginePipeline': 320,
   # 'youyaoqi.pipelines.ImagePipeline': 500,
}
# Append at the end:
FEED_EXPORT_ENCODING = 'utf-8'
MAX_PAGES = 414
# path where downloaded images are stored
IMAGES_STORE = './images'
# mysql setting
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USERNAME = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DATABASE = 'youyaoqi'
# mongo setting
MONGO_URI = 'mongodb://127.0.0.1:27017'
MONGO_DB = 'youyaoqi'
Add a models.py at the same level as items.py (contents sketched below).
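A minimal sketch of models.py, inferred from the fields the mongoengine pipeline sets on `Comic`; the `StringField` types are assumptions:

```python
# -*- coding: utf-8 -*-
from mongoengine import Document, StringField

# Field set inferred from YouyaoqiMongoenginePipeline; types are assumptions.
class Comic(Document):
    comic_id = StringField()
    name = StringField()
    cover = StringField()
    line1 = StringField()
    line2 = StringField()
    update_type = StringField()
```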
items.py holds the item definition (also sketched below).
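A sketch of items.py consistent with the fields the spider fills and the `collection` attribute the Mongo pipelines read:

```python
# -*- coding: utf-8 -*-
import scrapy

class YouyaoqiItem(scrapy.Item):
    # collection/table name the Mongo and mongoengine pipelines dispatch on
    collection = 'manhua'
    comic_id = scrapy.Field()
    name = scrapy.Field()
    cover = scrapy.Field()
    line1 = scrapy.Field()
    line2 = scrapy.Field()
    update_type = scrapy.Field()
```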
### Spider code
# -*- coding: utf-8 -*-
import json
import scrapy
from youyaoqi.items import YouyaoqiItem
class ManhuaSpider(scrapy.Spider):
    name = 'manhua'
    allowed_domains = ['www.u17.com']
    start_urls = ['http://www.u17.com/th99_gr99_ca99_ss99_ob0_ac0_as0_wm0_co99_ct99_p1.html?order=2']
    def headers(self):
        headers = {
            'Referer': 'http://www.u17.com/comic_list/th99_gr99_ca99_ss99_ob0_ac0_as0_wm0_co99_ct99_p1.html?order=2',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36',
            'Host': 'www.u17.com',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
        }
        return headers
    # build a paginated POST request for each listing page
    def start_requests(self):
        data = {'data[group_id]': 'no', 'data[theme_id]': 'no', 'data[is_vip]': 'no',
                'data[accredit]': 'no', 'data[color]': 'no', 'data[comic_type]': 'no',
                'data[series_status]': 'no', 'data[order]': '2', 'data[page_num]': '1',
                'data[read_mode]': 'no'}
        max_page = self.settings.get('MAX_PAGES')
        base_url = 'http://www.u17.com/comic/ajax.php?mod=comic_list&act=comic_list_new_fun&a=get_comic_list'
        for page in range(1, max_page + 1):  # request pages 1..MAX_PAGES
            data['data[page_num]'] = str(page)
            yield scrapy.FormRequest(url=base_url,
                                     headers=self.headers(),
                                     method='POST',
                                     formdata=data,
                                     callback=self.parse)
    def parse(self, response):
        result_json = json.loads(response.text)
        data_list = result_json['comic_list']
        for data in data_list:
            item = YouyaoqiItem()
            item['comic_id'] = data['comic_id']
            item['name'] = data['name']
            item['cover'] = data['cover']
            item['update_type'] = data['update_type']
            item['line1'] = data['line1']
            item['line2'] = data['line2']
            yield item
            # also request each comic's detail page for its chapter info
            detail_url = 'http://www.u17.com/comic/%s.html' % item['comic_id']
            yield scrapy.Request(url=detail_url, headers=self.headers(), callback=self.parse_detail)
    def parse_detail(self, response):
        results = response.css('#chapter a::text').extract()
        for title in results:
            detail_item = YouyaoqiItem()
            detail_item['name'] = title.replace('"', '').strip()
            yield detail_item
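With the desired pipelines enabled in ITEM_PIPELINES, run the spider from the project root with `scrapy crawl manhua` (matching the spider's `name`).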
### Displaying the MongoDB data with Django
# Code to add to settings.py
STATICFILES_DIRS = [
    os.path.join(BASE_DIR, 'static'),
]
import pymongo
client = pymongo.MongoClient()
db = client['youyaoqi']
# views.py
from django.shortcuts import render
from day09_django import settings
def comic_list(request):
    # client = pymongo.MongoClient()
    # db = client['youyaoqi']
    db = settings.db
    manhua_result = db.manhua.find()
    context = {'manhua_result': manhua_result}
    return render(request, 'comic_list.html', context)
# urls.py (Django 2 routing differs from Django 1)
from django.contrib import admin
from django.urls import path
from comic.views import comic_list
urlpatterns = [
    path('admin/', admin.site.urls),
    path('comic_list/', comic_list),
]
# comic_list.html template that renders the MongoDB data
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>有妖气</title>
</head>
<body>
<table>
    {% for item in manhua_result %}
    <tr>
        <td>{{ item.name }}</td>
        <td><img src="{{item.cover}}"></td>
    </tr>
    {% endfor %}
</table>
</body>
</html>
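Start the development server with `python manage.py runserver` and open http://127.0.0.1:8000/comic_list/ (the path registered in urls.py above) to view the scraped names and cover images.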