# Scrapy Framework

Features: high crawl efficiency, strong extensibility, and, being written in Python, it runs cross-platform.
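The walkthrough below assumes a Scrapy project named `qichamao` containing a spider called `cat` (the names the code refers to). If starting from scratch, the project can be scaffolded with `scrapy startproject qichamao` and then `scrapy genspider cat www.qichamao.com` from inside the project directory.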
### Connecting and writing to the database: append the following at the end of settings.py

```python
# Uncomment ITEM_PIPELINES (it ships commented out, around line 67 of the generated settings.py)
ITEM_PIPELINES = {
    'qichamao.pipelines.QichamaoMysqlPipeline': 300,
}
FEED_EXPORT_ENCODING = 'utf-8'
MAX_PAGES = 200

# MySQL settings
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USERNAME = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_DATABASE = 'qichamao'
```
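The pipeline in the next step inserts rows into a `company` table inside the `qichamao` database, and that table must exist before the first crawl. A minimal one-off setup sketch: the column names come from the pipeline's INSERT statement, but the types and lengths here are assumptions, so adjust them to your data.

```python
import pymysql

# One-off setup: create the database and the company table the pipeline writes to.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS qichamao DEFAULT CHARACTER SET utf8')
cursor.execute('''
    CREATE TABLE IF NOT EXISTS qichamao.company (
        id INT AUTO_INCREMENT PRIMARY KEY,
        company_name VARCHAR(255),   -- assumed types/lengths, not from the original
        company_code VARCHAR(64),
        company_desc TEXT,
        logo_url VARCHAR(255),
        c_name VARCHAR(64),
        c_phone VARCHAR(32),
        c_position VARCHAR(64),
        c_email VARCHAR(128)
    ) DEFAULT CHARSET=utf8
''')
conn.close()
```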
### Add the following code to pipelines.py
```python
import pymysql


class QichamaoMysqlPipeline(object):

    def __init__(self, host, port, database, username, password):
        self.host = host
        self.port = port
        self.database = database
        self.username = username
        self.password = password

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MYSQL_* values defined in settings.py
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            port=crawler.settings.get('MYSQL_PORT'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            username=crawler.settings.get('MYSQL_USERNAME'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
        )

    def open_spider(self, spider):
        # Open the database connection when the spider starts
        self.db = pymysql.connect(host=self.host, user=self.username,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Release the database connection when the spider finishes
        self.db.close()

    def process_item(self, item, spider):
        # Insert one row per item
        sql = ('insert into company (company_name, company_code, company_desc, '
               'logo_url, c_name, c_phone, c_position, c_email) '
               'values (%s,%s,%s,%s,%s,%s,%s,%s)')
        self.cursor.execute(sql, (item['company_name'], item['company_code'],
                                  item['company_desc'], item['logo_url'],
                                  item['c_name'], item['c_phone'],
                                  item['c_position'], item['c_email']))
        self.db.commit()
        return item
```
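One commit per item is simple but fragile: a single bad row aborts the crawl with an unhandled exception. Below is a sketch of a more defensive `process_item`, meant as a drop-in replacement inside `QichamaoMysqlPipeline` above; whether to skip bad rows (as here) or let the error propagate is a design choice.

```python
    # Drop-in replacement for process_item in QichamaoMysqlPipeline above.
    def process_item(self, item, spider):
        sql = ('insert into company (company_name, company_code, company_desc, '
               'logo_url, c_name, c_phone, c_position, c_email) '
               'values (%s,%s,%s,%s,%s,%s,%s,%s)')
        values = (item['company_name'], item['company_code'], item['company_desc'],
                  item['logo_url'], item['c_name'], item['c_phone'],
                  item['c_position'], item['c_email'])
        try:
            self.cursor.execute(sql, values)
            self.db.commit()
        except pymysql.MySQLError as exc:
            self.db.rollback()  # undo the failed statement
            spider.logger.error('insert failed for %s: %s',
                                item.get('company_name'), exc)
        return item
```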
### items.py is used the same way as Django's models
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class QichamaoItem(scrapy.Item):
    # Define the fields for your item here
    company_name = scrapy.Field()
    company_code = scrapy.Field()
    company_desc = scrapy.Field()
    logo_url = scrapy.Field()
    c_name = scrapy.Field()
    c_phone = scrapy.Field()
    c_position = scrapy.Field()
    c_email = scrapy.Field()
```
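Like a Django model, the Item declares its fields once and is then populated like a dict; the difference is that assigning a key that was not declared raises a `KeyError` instead of silently adding an attribute. A quick illustration (the value is made up):

```python
from qichamao.items import QichamaoItem

item = QichamaoItem()
item['company_name'] = 'Example Co.'  # declared field: assignment works
print(item.get('company_name'))       # dict-style read: 'Example Co.'
# item['founded'] = 2020              # KeyError: 'founded' is not a declared field
```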
### Crawling Qichamao data and saving it to the database
```python
# -*- coding: utf-8 -*-
import json

import scrapy

from qichamao.items import QichamaoItem


class CatSpider(scrapy.Spider):
    name = 'cat'
    allowed_domains = ['www.qichamao.com']
    start_urls = ['https://www.qichamao.com/cert-wall/']

    # Build the paginated POST requests
    def start_requests(self):
        data = {'pagesize': '9'}
        headers = {
            'Referer': 'https://www.qichamao.com/cert-wall/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36',
            'Host': 'www.qichamao.com',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
        }
        max_page = self.settings.get('MAX_PAGES')
        base_url = 'https://www.qichamao.com/cert-wall/'
        # POST one request per page, starting from page 2
        for page in range(2, max_page):
            data['page'] = str(page)
            yield scrapy.FormRequest(url=base_url,
                                     headers=headers,
                                     method='POST',
                                     formdata=data,
                                     callback=self.parse)

    # Parse the JSON response and fill one item per company
    def parse(self, response):
        result_json = json.loads(response.text)
        data_list = result_json['dataList']
        for data in data_list:
            item = QichamaoItem()
            item['company_name'] = data['CompanyName']
            item['company_code'] = data['CompanyCode']
            item['company_desc'] = data['CompanyBrief']
            item['logo_url'] = data['logoUrl']
            item['c_name'] = data['c_name']
            item['c_phone'] = data['c_phone']
            item['c_position'] = data['c_position']
            item['c_email'] = data['c_email']
            yield item
```
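Run the spider from the project root with `scrapy crawl cat`. Once it finishes, a quick sanity check that rows actually reached MySQL, reusing the connection values from settings.py:

```python
import pymysql

# Verify that the crawl populated the company table.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', database='qichamao', charset='utf8')
cursor = conn.cursor()
cursor.execute('SELECT COUNT(*) FROM company')
print('rows stored:', cursor.fetchone()[0])
cursor.execute('SELECT company_name, c_phone FROM company LIMIT 3')
for row in cursor.fetchall():
    print(row)
conn.close()
```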