# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import pymongo
from mongoengine import connect
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from youyaoqi.models import Comic
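
# Every from_crawler hook below reads its configuration from settings.py. A
# minimal sketch of the expected entries; all concrete values here are
# examples, not taken from the original project:
#
#   ITEM_PIPELINES = {
#       'youyaoqi.pipelines.YouyaoqiMysqlPipeline': 300,
#       'youyaoqi.pipelines.ImagePipeline': 301,
#       'youyaoqi.pipelines.YouyaoqiMongoPipeline': 302,
#       'youyaoqi.pipelines.YouyaoqiMongoenginePipeline': 303,
#   }
#   MYSQL_HOST = 'localhost'
#   MYSQL_DATABASE = 'youyaoqi'
#   MYSQL_USERNAME = 'root'
#   MYSQL_PASSWORD = 'secret'
#   MYSQL_PORT = 3306
#   MONGO_URI = 'mongodb://localhost:27017'
#   MONGO_DB = 'youyaoqi'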


class YouyaoqiMysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.username = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get("MYSQL_HOST"),
            database=crawler.settings.get("MYSQL_DATABASE"),
            user=crawler.settings.get("MYSQL_USERNAME"),
            password=crawler.settings.get("MYSQL_PASSWORD"),
            port=crawler.settings.get("MYSQL_PORT"),
        )

    def open_spider(self, spider):
        # Open the database connection when the spider starts. pymysql 1.0+
        # requires keyword arguments to connect(), so the old positional form
        # is spelled out explicitly here.
        self.db = pymysql.connect(host=self.host, user=self.username,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Release the database connection when the spider closes. (The hook
        # must be named close_spider for Scrapy to call it.)
        self.db.close()

    def process_item(self, item, spider):
        # Placeholders passed to cursor.execute() must be unquoted %s tokens;
        # pymysql quotes and escapes the parameters itself.
        sql = ('insert into sort(comic_id, name, cover, line1, line2, update_type) '
               'values (%s, %s, %s, %s, %s, %s)')
        self.cursor.execute(sql, (item['comic_id'], item['name'], item['cover'],
                                  item['line1'], item['line2'], item['update_type']))
        self.db.commit()
        return item
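
# A sketch of the `sort` table the pipeline above writes to. The column types
# are assumptions inferred from the scraped fields; the real schema is not
# part of this file:
#
#   CREATE TABLE sort (
#       comic_id    VARCHAR(32),
#       name        VARCHAR(255),
#       cover       VARCHAR(255),
#       line1       VARCHAR(255),
#       line2       VARCHAR(255),
#       update_type VARCHAR(32)
#   );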


# Download and save cover images.
class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Store each image under the last segment of its URL.
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # Drop the item when none of its images downloaded successfully.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed')
        return item

    def get_media_requests(self, item, info):
        # Queue the cover URL for the images pipeline to fetch.
        yield Request(item['cover'])
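
# Note: Scrapy's ImagesPipeline (and subclasses like ImagePipeline above) only
# runs when IMAGES_STORE is configured in settings.py; the path below is an
# example, not a value from the original project:
#
#   IMAGES_STORE = './images'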


# MongoDB pipeline.
class YouyaoqiMongoPipeline(object):
    def __init__(self, uri, database):
        self.uri = uri
        self.database = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            uri=crawler.settings.get('MONGO_URI'),
            database=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # Open the MongoDB connection when the spider starts.
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.database]

    def close_spider(self, spider):
        # Release the MongoDB connection when the spider closes.
        self.client.close()

    def process_item(self, item, spider):
        # Insert the item into the collection named on the item class.
        # Collection.insert() was removed in pymongo 3.x; insert_one() is the
        # supported replacement.
        collection_name = item.collection
        self.db[collection_name].insert_one(dict(item))
        return item
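
# The pipelines here read `item.collection`, which is not a built-in
# scrapy.Item attribute, so the project's item class presumably declares it.
# A minimal sketch (the class name is an assumption; the field names come
# from the inserts above):
#
#   import scrapy
#
#   class ComicItem(scrapy.Item):
#       collection = 'manhua'  # target collection / routing key
#       comic_id = scrapy.Field()
#       name = scrapy.Field()
#       cover = scrapy.Field()
#       line1 = scrapy.Field()
#       line2 = scrapy.Field()
#       update_type = scrapy.Field()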


# mongoengine pipeline.
class YouyaoqiMongoenginePipeline(object):
    def __init__(self, database):
        self.database = database

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            database=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # Open the mongoengine connection when the spider starts.
        self.con = connect(self.database)

    def close_spider(self, spider):
        # Release the mongoengine connection when the spider closes;
        # connect() returns a pymongo MongoClient, so close() is available.
        self.con.close()

    def process_item(self, item, spider):
        # Persist comic items through the Comic document model. return item
        # sits outside the if so non-comic items keep flowing to later
        # pipelines instead of being silently dropped.
        if item.collection == 'manhua':
            comic = Comic()
            comic.comic_id = item['comic_id']
            comic.name = item['name']
            comic.cover = item['cover']
            comic.line1 = item['line1']
            comic.line2 = item['line2']
            comic.update_type = item['update_type']
            comic.save()
        return item
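
# A minimal sketch of the Comic document imported from youyaoqi.models. The
# field types are assumptions inferred from the assignments above:
#
#   from mongoengine import Document, StringField
#
#   class Comic(Document):
#       comic_id = StringField()
#       name = StringField()
#       cover = StringField()
#       line1 = StringField()
#       line2 = StringField()
#       update_type = StringField()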