|
# -*- coding: utf-8 -*-
"""
=========================================
@File: 111.py
@Author: 鸢公子
@E-mail: [email protected]
@Time: April 30, 2020
@Desc: Scrapy spider: crawls a listing JSON endpoint, yields each listing
       row as an item, then follows each row's detail page and extracts
       three kinds of records (institution info, abnormal-event info,
       investor info).
=========================================
"""
import json

import scrapy
from scrapy import Request


class ZqTzSpider(scrapy.Spider):
    """Spider for institution records: listing JSON -> per-item detail pages."""

    name = '某网'
    # Detail-page URL template; filled with each row's 'url' via %-formatting.
    uu = '部分网址'

    def parse(self, response):
        """Parse the listing JSON response.

        Yields every row of ``row['content']`` as an item (tagged with a
        default ``'pop'`` of ``'amac_code'``), then yields one detail-page
        Request per row.  Items are yielded before any Request, matching
        the original two-pass order.
        """
        row = json.loads(response.text)
        # First pass: tag and emit the listing rows themselves.
        for item in row['content']:
            item.setdefault('pop', 'amac_code')
            yield item
        # Second pass: follow each row's detail page, carrying its id along.
        for item in row['content']:
            yield Request(self.uu % item['url'], self.parse_item,
                          meta={'id': item['id']})

    @staticmethod
    def item_up(d1: dict, d2: dict) -> dict:
        """Merge ``d2`` into ``d1`` in place and return ``d1``.

        Convenience helper for building a single item dict from two parts.
        """
        d1.update(d2)
        return d1

    def parse_item(self, response):
        """Extract institution info, abnormal info and investor info.

        Yields one dict per record; every dict carries the listing row's id
        (``pid``), a ``pop`` routing tag and the source URL.
        """
        items = []
        pid = response.meta['id']

        # --- Institution info: the special-alert row merged with the
        # key/value table under the "机构信息" section. ---
        base = {
            'pid': pid,
            'pop': '机构信息表',
            'from_url': response.url,
            '更新': response.xpath('//tr[td[text()="特别提示信息"]]/td[2]/text()').get(''),
            '提示': response.xpath('//tr[td[text()="特别提示信息"]]/td[4]/text()').get('').strip(),
        }
        table = {}
        for tr in response.xpath('//div[div/span[text()="机构信息"]]//tr'):
            key = ''.join(i.strip() for i in
                          tr.xpath('./td[@class="title"]//text()').getall())
            # De-duplicate value fragments.  dict.fromkeys keeps first-seen
            # order; the original joined a set, whose iteration order is
            # nondeterministic, so the joined string could vary between runs.
            vals = dict.fromkeys(
                i.strip() for i in
                tr.xpath('./td[not(@class="title")]//text()').getall())
            table[key] = ''.join(vals).replace('投诉', '')
        items.append(self.item_up(base, table))

        # --- Abnormal-event rows under the "机构诚信信息" section. ---
        for tr in response.xpath('//tr[td[text()="机构诚信信息"]]//tbody/tr'):
            items.append({
                'pid': pid,
                'pop': '异常信息表',  # abnormal-info table
                'from_url': response.url,
                '异常': tr.xpath('./td[1]/text()').get(),
                '事件': ''.join(i.strip() for i in
                              tr.xpath('./td[2]//text()').getall()),
            })

        # --- Investor rows under the "出资人信息" section. ---
        for tr in response.xpath('//div[span[text()="出资人信息"]]'
                                 '/following-sibling::div//tbody//tbody/tr'):
            items.append({
                'pid': pid,
                'pop': 'amac_chu_zi',  # investor-info table
                'from_url': response.url,
                '姓名/名称': tr.xpath('./td[2]/text()').get(''),
                '持股比例': tr.xpath('./td[3]/text()').get(''),
            })

        yield from items
|