Help needed: Python multithreading problem. How should the multithreading be fixed?

Q&A | 1459 views | 0 replies | 2020-03-12



Asking for advice on a multithreaded crawler, thanks!!

1. The thread lock throws an error on release (see the first sketch after this list)

Error message: release unlocked lock

2. The download part of the code never executes (see the second sketch after this list)
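For reference on problem 1: "release unlocked lock" is the RuntimeError raised when release() is called more times than acquire(). A minimal standalone sketch (hypothetical names), showing the usual fix of pairing each acquire with exactly one release via a with block:

import threading

lock = threading.Lock()
imgs = ['a.jpg', 'b.jpg']  # hypothetical work items

# Anti-pattern: acquire once before the loop, release inside it.
# The second release() hits an already-unlocked lock and raises
# "release unlocked lock".

# Fix: one acquire paired with one release per iteration; the with
# statement guarantees the release even if the body raises.
for img in imgs:
    with lock:
        print('processing', img)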
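For reference on problem 2: queue.Queue.put() accepts a single item; its second positional parameter is the block flag, not a second value. Passing two values enqueues only the first one, so the consumer's tuple unpacking fails and the download code never runs. A minimal standalone sketch with hypothetical values:

from queue import Queue

q = Queue()

# Wrong: the second argument is treated as the block flag, so only
# the URL lands in the queue and unpacking it later raises ValueError:
# q.put(down_url, down_name)

# Right: enqueue one tuple and unpack it on the consumer side.
q.put(('http://example.com/file.rar', 'demo/file.rar'))
down_url, down_name = q.get()
print(down_url, down_name)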




I cobbled this together by copying and tweaking other code, so it's pretty rough. Please bear with me!

How should I change it to get the multithreading working? Any advice appreciated!
# -*- coding: UTF-8 -*-
import requests, os
from lxml import etree
from fake_useragent import UserAgent
import threading
from queue import Queue

lock = threading.Lock()  # create a lock


# Producer: crawls list pages, parses detail pages, and queues downloads
class Procuder(threading.Thread):
    def __init__(self,page_queue,down_queue,*args,**kwargs):
        super(Procuder,self).__init__(*args,**kwargs)
        self.page_queue=page_queue
        self.down_queue=down_queue
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse(url)

    def parse(self,url):
        print(f'>>> Crawling list page {url} ...')
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        req = etree.HTML(response)
        urllist = req.xpath('//dl[@class="imglist"]/dt/ul[@class="listimg"]/li/span[@class="listpic"]/a/@href')
        print(len(urllist))
        print(urllist)
        for href in urllist:
            try:
                self.parse_page(href)
            except Exception as e:
                print(f'Failed to fetch detail page, error: {e}')


    def parse_page(self,url):
        print(f'>>> Crawling detail page {url} ...')
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        req = etree.HTML(response)
        h2 = req.xpath('//div[@class="arcinfo"]/h2/text()')[0]
        print(h2)
        os.makedirs(f'{h2}/', exist_ok=True)
        article = req.xpath('//div[@class="contentinfo"]/table//text()')
        article = ''.join(article)
        article = article.strip()
        print(article)
        texts = f'{h2}\n{article}'
        self.get_text(h2, texts)

        imgs = req.xpath('//div[@class="contentinfo"]/table//@src')
        if imgs:
            i = 1
            for img in imgs:
                # Fix for bug 1: the original acquired the lock once before
                # the loop but released it on every pass, so the second
                # release() raised "release unlocked lock". A with block pairs
                # one acquire with one release per iteration, and the lock is
                # dropped before the slow download starts.
                with lock:
                    img_url = f'http://www.uimaker.com{img}'
                    suffix = os.path.splitext(img)[1]
                    img_name = f'{i}{suffix}'
                    print(img_url, img_name)
                    i = i + 1
                self.get_downimg(h2, img_url, img_name)

        if int(req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]) == 0:
            down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]
            down_name = f'{h2}/{h2}.rar'
            print(down_url, down_name)
            # Fix for bug 2: put() takes one item (the second positional
            # argument is the block flag), so the original silently dropped
            # the file name. Enqueue a tuple the consumer can unpack.
            self.down_queue.put((down_url, down_name))




    # Save the text content
    def get_text(self, h2, texts):
        print("Saving text content...")
        with open(f'{h2}/{h2}.txt', 'w', encoding="utf-8") as f:
            f.write(texts)
        print(">>> Text content saved!")

    # Download an image
    def get_downimg(self, h2, img_url, img_name):
        print("Downloading image...")
        r = requests.get(img_url, headers=self.headers, timeout=6)
        with open(f'{h2}/{img_name}', 'wb') as f:
            f.write(r.content)
        print(">>> Image downloaded!")


# Consumer: takes (url, name) pairs off the down queue and downloads them
class Consumer(threading.Thread):
    def __init__(self,page_queue,down_queue,*args,**kwargs):
        super(Consumer,self).__init__(*args,**kwargs)
        self.page_queue=page_queue
        self.down_queue=down_queue
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def run(self):
        while True:
            if self.page_queue.empty() and self.down_queue.empty():
                break
            down_url, down_name = self.down_queue.get()
            self.down(down_url, down_name)

    # Download a material archive
    def down(self, down_url, down_name):
        print("Downloading material...")
        r = requests.get(down_url, headers=self.headers, timeout=6)
        with open(down_name, 'wb') as f:
            f.write(r.content)
        print(">>> Material downloaded!")

def main():
    page_queue=Queue(1000)
    down_queue=Queue(500)
    for i in range(1, 71):
        url = f"http://www.uimaker.com/uimakerdown/list_36_{i}.html"        print(f'>>> 正在爬取 第{i + 1}页 列表页,链接:{url} ...')
        page_queue.put(url)

    for x in range(3):
        t=Procuder(page_queue,down_queue)
        t.start()

    for x in range(5):
        t=Consumer(page_queue,down_queue)
        t.start()


if __name__=='__main__':
    main()
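A further note on the shutdown logic: the consumer's empty() check is racy, since both queues can be momentarily empty while a producer is still parsing a page, and once the producers exit, a blocked get() waits forever. A common alternative, sketched here with hypothetical names, is to push one sentinel per consumer after all the work has been queued:

import threading
from queue import Queue

def consumer(q):
    while True:
        item = q.get()
        if item is None:  # sentinel: no more work is coming
            break
        down_url, down_name = item
        print('downloading', down_url, '->', down_name)

q = Queue()
workers = [threading.Thread(target=consumer, args=(q,)) for _ in range(5)]
for w in workers:
    w.start()

q.put(('http://example.com/a.rar', 'a.rar'))  # hypothetical work item
for _ in workers:  # one sentinel per consumer so every thread exits
    q.put(None)
for w in workers:
    w.join()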

