### Writing the MySQL, image download, MongoDB, and mongoengine pipelines in pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import pymongo
from mongoengine import connect
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from youyaoqi.models import Comic


class YouyaoqiMysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.username = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get("MYSQL_HOST"),
            database=crawler.settings.get("MYSQL_DATABASE"),
            user=crawler.settings.get("MYSQL_USERNAME"),
            password=crawler.settings.get("MYSQL_PASSWORD"),
            port=crawler.settings.get("MYSQL_PORT"),
        )

    def open_spider(self, spider):
        # Open the database connection when the spider starts
        self.db = pymysql.connect(host=self.host, user=self.username,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Release the database connection when the spider closes
        self.db.close()

    def process_item(self, item, spider):
        # Leave the %s placeholders unquoted; the driver quotes parameters itself
        sql = 'insert into sort(comic_id, name, cover, line1, line2, update_type) values (%s, %s, %s, %s, %s, %s)'
        self.cursor.execute(sql, (item['comic_id'], item['name'], item['cover'],
                                  item['line1'], item['line2'], item['update_type']))
        self.db.commit()
        return item


# Image download and save
class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name the file after the last path segment of the image URL
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['cover'])


# MongoDB pipeline (pymongo)
class YouyaoqiMongoPipeline(object):
    def __init__(self, uri, database):
        self.uri = uri
        self.database = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            uri=crawler.settings.get('MONGO_URI'),
            database=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # Open the database connection when the spider starts
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.database]

    def close_spider(self, spider):
        # Release the database connection when the spider closes
        self.client.close()

    def process_item(self, item, spider):
        # Insert the item; insert_one replaces the deprecated insert()
        collection_name = item.collection
        self.db[collection_name].insert_one(dict(item))
        return item


# mongoengine pipeline
class YouyaoqiMongoenginePipeline(object):
    def __init__(self, database):
        self.database = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            database=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # Connect mongoengine to the target database
        self.con = connect(self.database)

    def close_spider(self, spider):
        # Release the database connection
        self.con.close()

    def process_item(self, item, spider):
        # Map the item onto the Comic document and save it
        if item.collection == 'manhua':
            comic = Comic()
            comic.comic_id = item['comic_id']
            comic.name = item['name']
            comic.cover = item['cover']
            comic.line1 = item['line1']
            comic.line2 = item['line2']
            comic.update_type = item['update_type']
            comic.save()
        return item
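
The MySQL pipeline assumes a `sort` table already exists in the `youyaoqi` database. A one-off setup script along these lines would create it; the column types and lengths here are assumptions, since the original post never shows the schema:

# create_table.py -- setup sketch; column types/lengths are assumed
import pymysql

db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     database='youyaoqi', charset='utf8', port=3306)
cursor = db.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS sort (
        id INT AUTO_INCREMENT PRIMARY KEY,
        comic_id VARCHAR(64),
        name VARCHAR(255),
        cover VARCHAR(512),
        line1 VARCHAR(255),
        line2 VARCHAR(255),
        update_type VARCHAR(64)
    ) DEFAULT CHARSET=utf8
''')
db.commit()
db.close()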

### settings.py

# Around line 67, set ITEM_PIPELINES to (enable whichever pipelines you need):
ITEM_PIPELINES = {
    # 'youyaoqi.pipelines.YouyaoqiMysqlPipeline': 300,
    'youyaoqi.pipelines.YouyaoqiMongoPipeline': 310,
    # 'youyaoqi.pipelines.YouyaoqiMongoenginePipeline': 320,
    # 'youyaoqi.pipelines.ImagePipeline': 500,
}

# Append at the end of the file
FEED_EXPORT_ENCODING = 'utf-8'
MAX_PAGES = 414
# Where downloaded images are saved
IMAGES_STORE = './images'

# MySQL settings
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USERNAME = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DATABASE = 'youyaoqi'

# MongoDB settings
MONGO_URI = 'mongodb://127.0.0.1:27017'
MONGO_DB = 'youyaoqi'
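
With `YouyaoqiMongoPipeline` enabled, a quick sanity check that items are landing in MongoDB is a few lines of pymongo against the same MONGO_URI/MONGO_DB values (a sketch):

import pymongo

client = pymongo.MongoClient('mongodb://127.0.0.1:27017')
db = client['youyaoqi']
print(db.manhua.count_documents({}))  # number of comics stored so far
print(db.manhua.find_one())           # peek at one stored document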

Add a models.py at the same level as items.py:

# models.py contents
from mongoengine import *


class Comic(Document):
    comic_id = StringField(max_length=512)
    name = StringField(max_length=512)
    cover = StringField(max_length=512)
    line1 = StringField(max_length=512)
    line2 = StringField(max_length=512)
    update_type = StringField(max_length=512)
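
Once the mongoengine pipeline has run, the same `Comic` document can be used to read the data back; a minimal sketch:

from mongoengine import connect

from youyaoqi.models import Comic

connect('youyaoqi')              # same database as MONGO_DB in settings.py
print(Comic.objects.count())     # documents saved by the pipeline
for comic in Comic.objects[:5]:  # first five comics
    print(comic.comic_id, comic.name)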

The contents of items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YouyaoqiItem(scrapy.Item):
    # define the fields for your item here like:
    collection = table = 'manhua'
    comic_id = scrapy.Field()
    name = scrapy.Field()
    cover = scrapy.Field()
    line1 = scrapy.Field()
    line2 = scrapy.Field()
    update_type = scrapy.Field()


class YouyaoqiDetailItem(scrapy.Item):
    collection = table = 'manhua_chapter'
    name = scrapy.Field()
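
Note that `collection = table = 'manhua'` is a plain class attribute, not a `scrapy.Field()`: it never shows up in the scraped data, but the pipelines can read it as `item.collection` to choose the target collection or table. For example:

item = YouyaoqiItem(name='test')
print(item.collection)  # 'manhua' -- read by the pipelines
print(dict(item))       # {'name': 'test'} -- collection is not an item field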

### The spider code

# -*- coding: utf-8 -*-
import json

import scrapy

from youyaoqi.items import YouyaoqiItem, YouyaoqiDetailItem


class ManhuaSpider(scrapy.Spider):
    name = 'manhua'
    allowed_domains = ['www.u17.com']
    start_urls = ['http://www.u17.com/th99_gr99_ca99_ss99_ob0_ac0_as0_wm0_co99_ct99_p1.html?order=2']

    def headers(self):
        headers = {
            'Referer': 'http://www.u17.com/comic_list/th99_gr99_ca99_ss99_ob0_ac0_as0_wm0_co99_ct99_p1.html?order=2',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36',
            'Host': 'www.u17.com',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
        }
        return headers

    # Build the paginated POST requests against the AJAX endpoint
    def start_requests(self):
        data = {'data[group_id]': 'no', 'data[theme_id]': 'no', 'data[is_vip]': 'no',
                'data[accredit]': 'no', 'data[color]': 'no', 'data[comic_type]': 'no',
                'data[series_status]': 'no', 'data[order]': '2', 'data[page_num]': '1',
                'data[read_mode]': 'no'}

        max_page = self.settings.get('MAX_PAGES')
        base_url = 'http://www.u17.com/comic/ajax.php?mod=comic_list&act=comic_list_new_fun&a=get_comic_list'
        # Start from page 1; since start_requests is overridden, start_urls is never used
        for page in range(1, max_page + 1):
            data['data[page_num]'] = str(page)
            yield scrapy.FormRequest(url=base_url,
                                     headers=self.headers(),
                                     method='POST',
                                     formdata=data,
                                     callback=self.parse)

    def parse(self, response):
        result_json = json.loads(response.text)
        data_list = result_json['comic_list']
        for data in data_list:
            item = YouyaoqiItem()
            item['comic_id'] = data['comic_id']
            item['name'] = data['name']
            item['cover'] = data['cover']
            item['update_type'] = data['update_type']
            item['line1'] = data['line1']
            item['line2'] = data['line2']
            yield item
            # Follow each comic's detail page for its chapter list
            detail_url = 'http://www.u17.com/comic/%s.html' % item['comic_id']
            yield scrapy.Request(url=detail_url, headers=self.headers(), callback=self.parse_detail)

    def parse_detail(self, response):
        results = response.css('#chapter a::text').extract()
        for title in results:
            # Chapters belong in the manhua_chapter collection, so use the detail item
            detail_item = YouyaoqiDetailItem()
            detail_item['name'] = title.replace('"', '').strip()
            yield detail_item
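
For reference, `parse` assumes the AJAX endpoint answers with JSON shaped roughly like the sketch below (the values are placeholders, not real data). Run the spider from the project root with `scrapy crawl manhua`.

# Assumed shape of the AJAX response; values are placeholders
sample_response = {
    'comic_list': [
        {'comic_id': '1234', 'name': '...', 'cover': 'http://...',
         'update_type': '...', 'line1': '...', 'line2': '...'},
    ],
}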

### Displaying the scraped MongoDB data with Django

# Code to add in settings.py
STATICFILES_DIRS = [
    os.path.join(BASE_DIR, 'static'),
]

# Share one MongoDB connection through the settings module
import pymongo
client = pymongo.MongoClient()
db = client['youyaoqi']

# views.py
from django.shortcuts import render

from day09_django import settings


def comic_list(request):
    # client = pymongo.MongoClient()
    # db = client['youyaoqi']
    db = settings.db
    manhua_result = db.manhua.find()

    context = {'manhua_result': manhua_result}

    return render(request, 'comic_list.html', context)

# urls.py (routing differs between Django 2 and Django 1)
from django.contrib import admin
from django.urls import path

from comic.views import comic_list

urlpatterns = [
    path('admin/', admin.site.urls),
    path('comic_list/', comic_list),
]

# comic_list.html: the template that renders the MongoDB data
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>有妖气</title>
</head>
<body>
<table>
    {% for item in manhua_result %}
        <tr>
            <td>{{ item.name }}</td>
            <td><img src="{{ item.cover }}"></td>
        </tr>
    {% endfor %}
</table>
</body>
</html>
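
If you also want the same data as JSON (for charts or an AJAX front end), a view along these lines should work. This is a sketch, not part of the original project: `comic_list_json` is a hypothetical name, and MongoDB's `_id` is projected away because ObjectId is not JSON serializable.

# views.py -- hypothetical JSON endpoint, a sketch only
from django.http import JsonResponse

from day09_django import settings


def comic_list_json(request):
    # Drop _id: ObjectId cannot be serialized to JSON
    manhua_result = list(settings.db.manhua.find({}, {'_id': 0}))
    return JsonResponse({'data': manhua_result})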