【爬虫】python 微博评论数据分析

原文链接

用python爬取微博评论数据,爬虫之路,永无止境。。(附源码)_主打Python的博客-CSDN博客_爬虫微博评论

#!/usr/bin/env python
# -*-coding:utf8-*-


from datetime import datetime
from requests_html import HTMLSession
import re, time
import csv
# import tkinter as tk
import urllib3  # 解除警告

# Silence urllib3's warnings; the requests in this file are sent with
# verify=False, which would otherwise emit a warning on every call.
urllib3.disable_warnings()
# Single HTTP session shared by every request the spider makes.
session = HTMLSession()


# Post whose comments are crawled: <host>/<user id>/<status key>#comment.
user_url = 'https://weibo.com/{}/{}#comment'.format('2318265821', 'KrBA7lvW4')

# Value sent as the ``cookie`` request header, assembled from its
# individual ``name=value`` pairs.
# NOTE(review): this looks like a logged-in session cookie committed to the
# source — consider loading it from the environment instead.
pass_wd = '; '.join((
    'WEIBOCN_FROM=1110005030',
    'SUB=_2A25Mx3mlDeRhGeNM41sV8i7KyzWIHXVsSAftrDV6PUJbkdANLUfEkW1NSeR9M3dIjq3lBi61DJC0D26LvrU8YMVV',
    'MLOGIN=1',
    '_T_WM=14744352522',
    'XSRF-TOKEN=781dcc',
))

# Output CSV, opened in append mode so repeated runs accumulate rows.
# The encoding is explicit (UTF-8 with BOM): the locale default cannot encode
# emoji on GBK systems, and the BOM lets spreadsheet tools detect UTF-8.
# NOTE(review): a file written by earlier runs under another locale encoding
# would end up with mixed encodings — start from a fresh file if in doubt.
f = open(r'评论.csv', 'a+', newline='', encoding='utf-8-sig')
# Column names; 'a' is the running sequence number of the row.
fileheader = ['a', 'screen_names', 'genders', 'std_create_times', 'texts', 'like_counts']
fp = csv.writer(f)
# Seeking to the end returns the file size: write the header only into an
# empty file, otherwise every run would insert another header row mid-file.
if f.seek(0, 2) == 0:
    fp.writerow(fileheader)


class WBSpider(object):
    """Crawl the comments of one Weibo post and write them out as CSV rows.

    Relies on two module-level objects: ``session`` (the shared HTTP session)
    and ``fp`` (the ``csv.writer`` that rows go to unless another writer is
    passed to :meth:`parse_response_data`).
    """

    # Emoticons arrive as ``<span ...><img alt=[name] ...></span>`` markup.
    _SPAN_RE = re.compile('<span.*?</span>')
    _ALT_RE = re.compile(r'alt=(.*?) ')

    def main(self, user_url, pass_wd):
        """Resolve the post behind ``user_url`` and crawl all of its comments.

        :param user_url: desktop URL of the post, e.g.
            ``https://weibo.com/<user id>/<status key>#comment``; the fragment
            and any query string are optional.
        :param pass_wd: value sent as the ``cookie`` request header.
        :raises IndexError: if no numeric post id is found in the
            ``statuses/show`` response.
        :raises KeyError: if the first comment page has no ``data`` payload.
        """
        i = 1  # page number, used for progress output only
        a = 1  # sequence number the next CSV row will receive
        headers_1 = {
            'cookie': pass_wd,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.0 Safari/537.36'

        }
        # NOTE(review): the referer points at a fixed, unrelated status key —
        # confirm whether it should follow the post being crawled.
        headers_2 = {
            "referer": "https://m.weibo.cn/status/Kk9Ft0FIg?jumpfrom=weibocom",
            'cookie': pass_wd,
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36'
        }
        status_key = self._status_key(user_url)

        # Look up the numeric post id that the comment endpoint expects.
        url_1 = f'https://weibo.com/ajax/statuses/show?id={status_key}'
        prox = ''
        response = session.get(url_1, proxies={'http': prox, 'https': prox}, headers=headers_1,
                               verify=False).content.decode()
        weibo_id = re.findall('"id":(.*?),"idstr"', response)[0]

        # First page of comments.
        start_url = f'https://m.weibo.cn/comments/hotflow?id={weibo_id}&mid={weibo_id}&max_id_type=0'
        response = session.get(start_url, proxies={'http': prox, 'https': prox}, headers=headers_2, verify=False).json()
        page = response['data']
        max_id = page['max_id']
        max_id_type = page['max_id_type']

        print('条数', len(page['data']))
        # The parser returns the next free sequence number, so numbering stays
        # continuous even when individual comments are skipped.
        a = self.parse_response_data(response, i, a)
        i += 1
        print('总条数', a - 1)

        # NOTE(review): the endpoint is understood to answer max_id == 0 on the
        # last page; asking for max_id=0 would start over from page one.
        if not max_id:
            return
        data = {
            'id': weibo_id,
            'mid': weibo_id,
            'max_id': max_id,
            'max_id_type': max_id_type
        }
        self.parse_page_func(data, weibo_id, headers_2, i, a)

    def parse_page_func(self, data, weibo_id, headers_2, i, a):
        """Fetch the page described by ``data`` and every page that follows it.

        Iterates (instead of recursing once per page) until a response has no
        comment list or no further ``max_id`` cursor.

        :param data: query parameters of the first page to fetch
            (``id``, ``mid``, ``max_id``, ``max_id_type``).
        :param weibo_id: numeric id of the post.
        :param headers_2: request headers for the mobile endpoint.
        :param i: page number of the first page fetched here.
        :param a: sequence number for the first row written here.
        :return: None
        """
        start_url = 'https://m.weibo.cn/comments/hotflow?'
        prox = ''
        while True:
            response = session.get(start_url, proxies={'http': prox, 'https': prox}, headers=headers_2,
                                   params=data, verify=False).json()
            page = response.get('data') or {}
            if not page.get('data'):
                # No comment list in the answer: nothing more can be read.
                print('未获取到更多评论,停止翻页')
                break

            print('条数:', len(page['data']))
            a = self.parse_response_data(response, i, a)
            i += 1
            print('总条数', a - 1)

            max_id = page.get('max_id')
            if not max_id:
                # No cursor for a following page (see NOTE in main()).
                break
            data = {
                'id': weibo_id,
                'mid': weibo_id,
                'max_id': max_id,
                'max_id_type': page['max_id_type']
            }

    def parse_response_data(self, response, i, a, writer=None):
        """Write one CSV row per comment contained in ``response``.

        :param response: decoded JSON of one comment page; the comments are
            read from ``response['data']['data']``.
        :param i: page number, used in the progress message only.
        :param a: sequence number given to the first row written.
        :param writer: object with a ``writerow`` method; defaults to the
            module-level ``fp``.
        :return: the sequence number following the last row written.
        """
        if writer is None:
            writer = fp
        data_list = response['data']['data']
        for data_json_dict in data_list:
            try:
                texts = self._strip_emoticon_markup(data_json_dict['text'])
                like_counts = str(data_json_dict['like_count'])
                # created_at carries its own UTC offset (%z); the parsed value
                # is stored as-is, no timezone conversion is applied.
                created_at = data_json_dict['created_at']
                std_transfer = '%a %b %d %H:%M:%S %z %Y'
                std_create_times = str(datetime.strptime(created_at, std_transfer))
                # The API reports gender as a letter; 'f' is mapped to '女',
                # anything else to '男'.
                gender = data_json_dict['user']['gender']
                genders = '女' if gender == 'f' else '男'
                screen_names = data_json_dict['user']['screen_name']

                data = [a, screen_names, genders, std_create_times, texts, like_counts]
                print(data)

                writer.writerow(data)

                print()
                a = a + 1
            except (LookupError, TypeError, ValueError, csv.Error) as e:
                # Malformed or unencodable record: skip it, but say so instead
                # of hiding it. I/O errors are deliberately not caught here.
                print(f'跳过一条无法解析的评论: {e!r}')
                continue
        print('*******************************************************************************************')
        print()
        print(f'*****第{i}页评论打印完成*****')
        return a

    @staticmethod
    def _status_key(user_url):
        """Return the last path segment of ``user_url`` (the status key)."""
        from urllib.parse import urlsplit

        # urlsplit separates query and fragment, so the URL needs no '#...'.
        return urlsplit(user_url).path.rstrip('/').rsplit('/', 1)[-1]

    @classmethod
    def _strip_emoticon_markup(cls, text):
        """Replace each ``<span>...</span>`` with the alt text found inside it."""
        def _alt_of(match):
            return ''.join(cls._ALT_RE.findall(match.group(0)))

        # A callable replacement keeps every emoticon in its own position and
        # avoids backslash-escape processing of the replacement text.
        return cls._SPAN_RE.sub(_alt_of, text)


if __name__ == '__main__':
    # Script entry point: crawl the configured post with the configured cookie.
    w = WBSpider()
    try:
        w.main(user_url, pass_wd)
    finally:
        # Close the CSV handle so buffered rows reach disk even when the
        # crawl aborts with an exception or is interrupted.
        f.close()