|
import csv
import json
import re
import time
from collections.abc import Iterable

import requests
import xlwt
from PIL import Image
from selenium import webdriver

# Next free worksheet row; row 0 holds the column titles.
n = 1

# HTTP headers sent with every feed request.
header = {
    'Host': 'h5.qzone.qq.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://user.qzone.qq.com/790178228?_t_=0.22746974226377736',
    'Connection': 'keep-alive',
}

_FEED_URL = "https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6"
_PAGE_SIZE = 20      # posts requested per page
_MAX_PAGES = 1000    # upper bound on pages fetched per account
_TIMEOUT = 30        # seconds before a feed request is abandoned

# Column index -> title. Columns 10 and 11 were titled the other way round in
# the original, which did not match the data written there (pos_y in 10, the
# place name in 11); the titles now agree with the data.
_COLUMN_TITLES = {
    1: "日期",
    2: "时间",
    3: "年份",
    4: "月份",
    5: "时间点",
    6: "图片数量",
    7: "评论数量",
    8: "手机型号",
    9: "经度",
    10: "纬度",
    11: "地点",
    12: "位置",
    13: "内容",
}

# Set up the workbook and write the title row.
file = xlwt.Workbook()
table = file.add_sheet('sheet name', cell_overwrite_ok=True)
for _column, _title in _COLUMN_TITLES.items():
    table.write(0, _column, _title)


def getfriends():
    """Read the friend list from ``qq.csv`` (contacts exported from the mailbox).

    The first row is treated as a header and skipped. For every other row the
    fourth column is taken and its last seven characters are dropped
    (presumably an ``@qq.com`` suffix -- confirm against the export format).

    Returns:
        list[str]: one entry per usable row; rows with fewer than four
        columns, or whose value is empty after trimming, are skipped.
    """
    with open('qq.csv', newline='') as csv_file:
        rows = list(csv.reader(csv_file))
    friends = []
    for row in rows[1:]:
        if len(row) <= 3:
            continue
        account = row[3][:-7]
        if account:
            friends.append(account)
    return friends


def get_g_tk(cookie):
    """Compute the ``g_tk`` request token from the ``p_skey`` cookie.

    Args:
        cookie: mapping of cookie names to values; must contain ``'p_skey'``.

    Returns:
        int: the hash of ``p_skey`` truncated to 31 bits.

    Raises:
        KeyError: if ``'p_skey'`` is not in ``cookie``.
    """
    hashes = 5381
    for letter in cookie['p_skey']:
        # ord() yields the character's code point.
        hashes += (hashes << 5) + ord(letter)
    return hashes & 0x7fffffff


def Login_QQ():
    """Open Qzone in Chrome, wait for a manual login and collect credentials.

    The browser stays open for ten seconds so the user can log in; the
    cookies and page source are then read and the browser is always closed.

    Returns:
        tuple: ``(cookie, g_tk, g_qzonetoken)`` where ``cookie`` is a dict of
        cookie names to values.

    Raises:
        RuntimeError: if the ``p_skey`` cookie or the ``g_qzonetoken`` script
            fragment is missing (typically because the login did not finish
            in time).
    """
    driver = webdriver.Chrome()
    try:
        driver.get("https://qzone.qq.com/")
        time.sleep(10)
        cookie = {elem['name']: elem['value'] for elem in driver.get_cookies()}
        html = driver.page_source
    finally:
        # Close the browser even when reading the page fails.
        driver.quit()

    if 'p_skey' not in cookie:
        raise RuntimeError("p_skey cookie not found; login did not complete")
    token_match = re.search(
        r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)',
        html,
    )
    if token_match is None:
        raise RuntimeError("g_qzonetoken not found in the page source")

    g_tk = get_g_tk(cookie)
    g_qzonetoken = token_match.group(1)
    print(g_qzonetoken)
    return (cookie, g_tk, g_qzonetoken)


def _strip_jsonp(text):
    """Return the payload between the first '(' and the last ')' of a JSONP reply."""
    start = text.find("(")
    end = text.rfind(")")
    if start == -1 or end <= start:
        return text
    return text[start + 1:end]


def _build_row(each_data):
    """Map one post dict to ``{column index: cell value}`` for the worksheet.

    Building the whole row before anything is written means a malformed post
    never leaves a half-filled row behind.

    Raises:
        LookupError, TypeError, ValueError, OverflowError, OSError: when the
            post lacks an expected field or carries an unusable timestamp.
    """
    created = time.localtime(each_data["created_time"])
    has_pic = "pic" in each_data
    comments = each_data.get("commentlist")
    lbs = each_data["story_info"]["lbs"] if "story_info" in each_data else None
    return {
        1: each_data["createTime"],              # date
        2: time.strftime("%H:%M:%S", created),   # time of day
        3: time.strftime("%Y", created),         # year
        4: time.strftime("%m", created),         # month
        5: time.strftime("%H", created),         # hour
        6: each_data["pictotal"] if has_pic else 0,
        # NOTE(review): the "tid" of the last comment is used as the comment
        # count, as in the original -- confirm this matches the real count.
        7: comments[-1]["tid"] if comments else 0,
        # NOTE(review): the device name is only recorded for posts with
        # pictures, as in the original -- confirm this is intended.
        8: each_data["source_name"] if has_pic else "",
        9: lbs["pos_x"] if lbs else "",
        10: lbs["pos_y"] if lbs else "",
        11: lbs["name"] if lbs else "",
        12: lbs["idname"] if lbs else "",
        13: each_data["conlist"][0]["con"] if each_data.get("content") else "",
    }


def spyder_info():
    """Download every friend's posts and store them in ``demo6.xls``.

    For each account returned by ``getfriends`` the feed is fetched page by
    page until a page carries no posts. Each post becomes one worksheet row;
    malformed posts are reported and skipped. The workbook is saved even when
    the crawl stops early because of an error.
    """
    global n
    friends = getfriends()
    cookie, g_tk, g_qzonetoken = Login_QQ()
    try:
        with requests.session() as s:
            for qq in friends:
                for pos in range(0, _MAX_PAGES * _PAGE_SIZE, _PAGE_SIZE):
                    param = {
                        'uin': qq,
                        'ftype': '0',
                        'sort': '0',
                        'pos': pos,
                        'num': '20',
                        'replynum': '100',
                        'g_tk': g_tk,
                        'callback': '_preloadCallback',
                        'code_version': '1',
                        'format': 'jsonp',
                        'need_private_comment': '1',
                        'qzonetoken': g_qzonetoken,
                    }
                    try:
                        respond = s.get(_FEED_URL, params=param, headers=header,
                                        cookies=cookie, timeout=_TIMEOUT)
                    except requests.RequestException as err:
                        print("error", qq, err)
                        break

                    test = _strip_jsonp(respond.text)
                    try:
                        Data = json.loads(test)
                    except ValueError as err:
                        print("error", qq, err)
                        break

                    posts = Data.get("msglist") if isinstance(Data, dict) else None
                    # No "lbs" in the payload (or no posts at all) means this
                    # account has been fully crawled.
                    if not posts or not re.search('lbs', test):
                        print('%s说说下载完成' % qq)
                        break

                    for each_data in posts:
                        try:
                            row = _build_row(each_data)
                        except (LookupError, TypeError, ValueError,
                                OverflowError, OSError) as err:
                            print("error", qq, repr(err))
                            continue
                        for column, value in row.items():
                            table.write(n, column, value)
                        n = n + 1
    finally:
        # Keep whatever was collected, even if the crawl was interrupted.
        file.save("demo6.xls")


if __name__ == '__main__':
    spyder_info()

# Crawling approach credited to the Zhihu user "牛逼韩".
|
|