回答

收藏

Python 爬QQ空间 有没有优雅一点的逻辑

问答交流 问答交流 815 人阅读 | 0 人回复 | 2021-09-13

import requests
from selenium import webdriver
from PIL import Image
import xlwt
import time
import re
import json
from collections.abc import Iterable
import csv

# Next spreadsheet row to fill; row 0 holds the column titles.
n = 1

# HTTP headers sent with every request to the Qzone feed endpoint.
# 'br' is deliberately not advertised in Accept-Encoding: Requests/urllib3
# can only decode Brotli when an optional brotli package is installed, and an
# undecoded body would break the JSON parsing of the response.
header = {
    'Host': 'h5.qzone.qq.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'https://user.qzone.qq.com/790178228?_t_=0.22746974226377736',
    'Connection': 'keep-alive',
}


# Spreadsheet setup: one workbook with a single sheet whose cells may be
# overwritten (cell_overwrite_ok=True).
file = xlwt.Workbook()
table = file.add_sheet('sheet name', cell_overwrite_ok=True)

# Titles for columns 1..13 of row 0, in column order. Column 0 is left empty.
_COLUMN_TITLES = (
    "日期",
    "时间",
    "年份",
    "月份",
    "时间点",
    "图片数量",
    "评论数量",
    "手机型号",
    "经度",
    "地点",
    "纬度",
    "位置",
    "内容",
)
for _col, _title in enumerate(_COLUMN_TITLES, start=1):
    table.write(0, _col, _title)

def getfriends(path='qq.csv', encoding=None):
    """Read friends' QQ numbers from a contact list exported from the mailbox.

    The first row of the CSV is treated as a header and skipped. For every
    other row the fourth column (index 3) is taken and its last 7 characters
    are dropped -- presumably the "@qq.com" suffix of the e-mail address
    (TODO confirm against a real export).

    Args:
        path: CSV file to read. Defaults to 'qq.csv' in the working directory.
        encoding: Text encoding passed to open(); None uses the platform
            default.

    Returns:
        A list of strings, one per data row that has at least four columns.
        An empty or header-only file yields an empty list.
    """
    friends = []
    # newline='' is what the csv module expects for files it reads itself;
    # the with-block guarantees the handle is closed.
    with open(path, newline='', encoding=encoding) as handle:
        rows = csv.reader(handle)
        next(rows, None)  # skip the header row, tolerate an empty file
        for row in rows:
            if len(row) < 4:
                # Blank or truncated line: there is no address column to read.
                continue
            friends.append(row[3][:-7])
    return friends


def get_g_tk(cookie):
    """Compute the g_tk request token from the login cookies.

    Args:
        cookie: Mapping of cookie names to values; must contain 'p_skey'.

    Returns:
        A non-negative int below 2**31 derived from the 'p_skey' value.

    Raises:
        KeyError: If 'p_skey' is not present in *cookie*.
    """
    hashes = 5381
    for letter in cookie['p_skey']:
        # ord() returns the character's code point.
        hashes += (hashes << 5) + ord(letter)
    return hashes & 0x7fffffff


def Login_QQ():
    """Open Qzone in Chrome and collect the values needed for API requests.

    The browser stays open for 10 seconds after the page is requested so the
    login can complete, then cookies and the page source are captured.

    Returns:
        A tuple (cookie, g_tk, g_qzonetoken): the cookies as a name -> value
        dict, the token computed by get_g_tk(), and the g_qzonetoken
        expression text extracted from the page source.

    Raises:
        KeyError: If the 'p_skey' cookie is missing (see get_g_tk).
        RuntimeError: If g_qzonetoken cannot be found in the page source.
    """
    driver = webdriver.Chrome()
    try:
        driver.get("https://qzone.qq.com/")
        time.sleep(10)
        cookie = {elem['name']: elem['value'] for elem in driver.get_cookies()}
        html = driver.page_source
    finally:
        # Always close the browser, even when page loading or cookie
        # collection fails; otherwise the Chrome process is leaked.
        driver.quit()

    match = re.search(
        r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)',
        html)
    if match is None:
        raise RuntimeError("g_qzonetoken not found in the Qzone page source")
    # NOTE(review): the captured text is whatever follows `return` in the
    # page script; if that is a quoted string literal the quotes are kept --
    # confirm this is what the API expects.
    g_qzonetoken = match.group(1)
    g_tk = get_g_tk(cookie)
    print(g_qzonetoken)
    return (cookie, g_tk, g_qzonetoken)

def _strip_jsonp(text):
    """Return the JSON payload found between the outermost parentheses of a JSONP reply."""
    start = text.find("(")
    end = text.rfind(")")
    if start == -1 or end <= start:
        raise ValueError("response is not a JSONP document")
    return text[start + 1:end]


def _post_cells(post):
    """Map spreadsheet column index -> value for one post dict."""
    created = time.localtime(post["created_time"])

    # The "tid" of the last comment is used as the comment count, exactly as
    # before. NOTE(review): presumably comment ids are sequential -- confirm.
    comments = post.get("commentlist")
    comment_count = comments[-1]["tid"] if comments else 0

    lbs = (post.get("story_info") or {}).get("lbs") or {}

    content = ""
    if post.get("content"):
        segments = post.get("conlist") or []
        if segments and isinstance(segments[0], dict) and "con" in segments[0]:
            content = segments[0]["con"]
        else:
            # No usable first segment: fall back to the raw content field.
            content = post["content"]

    return {
        1: post["createTime"],                      # date
        2: time.strftime("%H:%M:%S", created),      # time of day
        3: time.strftime("%Y", created),            # year
        4: time.strftime("%m", created),            # month
        5: time.strftime("%H", created),            # hour
        6: post.get("pictotal", 0) if "pic" in post else 0,  # picture count
        7: comment_count,
        # The device name is recorded for every post, not only for posts
        # that carry pictures.
        8: post.get("source_name", ""),
        # Columns follow the title row: 9 longitude, 10 place name,
        # 11 latitude, 12 location id name.
        9: lbs.get("pos_x", ""),
        10: lbs.get("name", ""),
        11: lbs.get("pos_y", ""),
        12: lbs.get("idname", ""),
        13: content,
    }


def _write_post(row, post):
    """Write one post into spreadsheet row *row*; nothing is written if a field is missing."""
    # Compute every cell first so a malformed post cannot leave a
    # half-filled row behind.
    cells = _post_cells(post)
    for col, value in cells.items():
        table.write(row, col, value)


def spyder_info():
    """Download every friend's posts and store them in "demo6.xls".

    For each QQ number returned by getfriends() the feed endpoint is queried
    in pages of 20 posts (at most 1000 pages). Paging for a friend stops at
    the first page whose payload does not contain 'lbs', has no posts, or
    cannot be parsed. Each post becomes one spreadsheet row; the module-level
    counter ``n`` is advanced per written row.

    A post with missing or malformed fields is reported and skipped without
    affecting the other posts on the page. The workbook is saved even when
    the run is interrupted by an error, so rows collected so far are kept.
    """
    global n
    s = requests.session()
    friends = getfriends()
    cookie, g_tk, g_qzonetoken = Login_QQ()
    try:
        for qq in friends:
            for i in range(1000):
                param = {
                    'uin': qq,
                    'ftype': '0',
                    'sort': '0',
                    'pos': i * 20,
                    'num': '20',
                    'replynum': '100',
                    'g_tk': g_tk,
                    'callback': '_preloadCallback',
                    'code_version': '1',
                    'format': 'jsonp',
                    'need_private_comment': '1',
                    'qzonetoken': g_qzonetoken,
                }
                # A timeout keeps one stalled connection from hanging the
                # whole run.
                respond = s.get(
                    "https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6",
                    params=param, headers=header, cookies=cookie, timeout=30)
                try:
                    payload = _strip_jsonp(respond.text)
                    Data = json.loads(payload)
                except ValueError as exc:
                    # Not JSONP/JSON (e.g. an error page): give up on this
                    # friend instead of aborting the whole run.
                    print("error", exc)
                    break

                msglist = Data.get("msglist") if isinstance(Data, dict) else None
                # 'lbs' missing from the payload marks the end of this
                # friend's posts; an empty/absent msglist is treated the same.
                if 'lbs' not in payload or not msglist:
                    print('%s说说下载完成' % qq)
                    break

                for each_data in msglist:
                    try:
                        _write_post(n, each_data)
                    except (KeyError, IndexError, TypeError) as exc:
                        print("error", exc)
                        continue
                    n = n + 1
    finally:
        file.save("demo6.xls")

# Run the scraper only when this file is executed as a script. The guard must
# compare the variable __name__ (not the string literal '__name__').
if __name__ == '__main__':
    spyder_info()

#来自知乎“牛逼韩”的爬法


分享到:
回复

使用道具 举报