|
求教,Python 多线程问题 ,多线程应该如何修改?
多线程爬虫求教,感谢!!
1.线程锁,释放报错
错误代码:release unlocked lock
2.下载部分代码不执行
抄抄改改完成的,比较渣,见谅!
需要如何修改完成多线程呢?求教!
# -*- coding: UTF-8 -*-import requests,os from lxml import etree from fake_useragent import UserAgent import threading from queue import Queue lock = threading.Lock() #创建一把锁#生产者模式class Procuder(threading.Thread): def __init__(self,page_queue,down_queue,*args,**kwargs): super(Procuder,self).__init__(*args,**kwargs) self.page_queue=page_queue self.down_queue=down_queue self.ua = UserAgent() self.headers = {"User-Agent": self.ua.random} def run(self): while True: if self.page_queue.empty(): break url = self.page_queue.get() self.parse(url) def parse(self,url): print(f'>>> 正在抓取列表页 {url} 数据...') response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk") req = etree.HTML(response) urllist = req.xpath('//dl[@class="imglist"]/dt/ul[@class="listimg"]/li/span[@class="listpic"]/a/@href') print(len(urllist)) print(urllist) for href in urllist: try: self.parse_page(href) except Exception as e: print(f'获取详情数据失败,错误代码:{e}') def parse_page(self,url): print(f'>>> 正在抓取详情页 {url} 数据...') response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk") req = etree.HTML(response) h2 = req.xpath('//div[@class="arcinfo"]/h2/text()')[0] print(h2) os.makedirs(f'{h2}/', exist_ok=True) article = req.xpath('//div[@class="contentinfo"]/table//text()') article = ''.join(article) article = article.strip() print(article) texts = f'{h2}\n{article}' self.get_text(h2, texts) imgs = req.xpath('//div[@class="contentinfo"]/table//@src') if imgs: global lock lock.acquire() # 加锁 i = 1 for img in imgs: img_url = f'http://www.uimaker.com{img}' suffix = os.path.splitext(img)[1] img_name = f'{i}{suffix}' print(img_url, img_name) i=i+1 lock.release() # 解锁 self.get_downimg(h2, img_url, img_name) if int(req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]) == 0: down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0] down_name = f'{h2}/{h2}.rar' print(down_url, down_name) self.down_queue.put(down_url,down_name) # 保存文本内容 def get_text(self, h2, texts): print("开始保存文本内容...") with open(f'{h2}/{h2}.txt', 'w', encoding="utf-8") as f: f.write(texts) print(">>>保存文本内容完成!") # 下载图片 def get_downimg(self, h2, img_url, img_name): print("开始下载图片...") r = requests.get(img_url, headers=self.headers, timeout=6) with open(f'{h2}/{img_name}', 'wb') as f: f.write(r.content) print(">>>下载图片完成!") #消费者模式class Consumer(threading.Thread): def __init__(self,page_queue,down_queue,*args,**kwargs): super(Consumer,self).__init__(*args,**kwargs) self.page_queue=page_queue self.down_queue=down_queue self.ua = UserAgent() self.headers = {"User-Agent": self.ua.random} def run(self): while True: if self.page_queue.empty() and self.down_queue.empty(): break down_url,down_name=self.down_queue.get() self.down(down_url,down_name) #下载素材 def down(self,down_url,down_name): print("开始下载素材...") r=requests.get(down_url,headers=self.headers,timeout=6) with open(down_name,'wb') as f: f.write(r.content) print(">>>下载素材完成!") def main(): page_queue=Queue(1000) down_queue=Queue(500) for i in range(1, 71): url = f"http://www.uimaker.com/uimakerdown/list_36_{i}.html" print(f'>>> 正在爬取 第{i + 1}页 列表页,链接:{url} ...') page_queue.put(url) for x in range(3): t=Procuder(page_queue,down_queue) t.start() for x in range(5): t=Consumer(page_queue,down_queue) t.start() if __name__=='__main__': main()
|
|