【爬虫】python 微博评论数据分析
原文链接
# !/usr/bin/nev python
# -*-coding:utf8-*-
from datetime import datetime
from requests_html import HTMLSession
import re, time
import csv
# import tkinter as tk
import urllib3 # 解除警告
urllib3.disable_warnings()
session = HTMLSession()
user_url = 'https://weibo.com/2318265821/KrBA7lvW4#comment'
pass_wd = 'WEIBOCN_FROM=1110005030; SUB=_2A25Mx3mlDeRhGeNM41sV8i7KyzWIHXVsSAftrDV6PUJbkdANLUfEkW1NSeR9M3dIjq3lBi61DJC0D26LvrU8YMVV; MLOGIN=1; _T_WM=14744352522; XSRF-TOKEN=781dcc'
f = open(r'评论.csv','a+',newline='')
fileheader = ['a','screen_names', 'genders', 'std_create_times', 'texts', 'like_counts']
fp = csv.DictWriter(f, fileheader) # 定义表头
fp.writeheader() # 写入表头
fp = csv.writer(f)
class WBSpider(object):
def main(self, user_url, pass_wd):
i = 1
a = 1
headers_1 = {
'cookie': pass_wd,
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.0 Safari/537.36'
}
headers_2 = {
"referer": "https://m.weibo.cn/status/Kk9Ft0FIg?jumpfrom=weibocom",
'cookie': pass_wd,
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Mobile Safari/537.36'
}
# user_url = 'https://weibo.com/2318265821/KrBA7lvW4#comment'
# print(re.findall('/(.*?)#', user_url))
uid_1 = re.findall('/(.*?)#', user_url)[0]
uid_2 = uid_1.split('/', 3)[3]
# print(uid_2)
url_1 = f'https://weibo.com/ajax/statuses/show?id={uid_2}'
prox = ''
response = session.get(url_1, proxies={'http': prox, 'https': prox}, headers=headers_1,
verify=False).content.decode()
# print(response)
weibo_id = re.findall('"id":(.*?),"idstr"', response)[0]
# print(weibo_id)
# 构造起始地址
start_url = f'https://m.weibo.cn/comments/hotflow?id={weibo_id}&mid={weibo_id}&max_id_type=0'
"""
2.发送请求,获取响应: 解析起始的url地址
:return:
"""
prox = ''
response = session.get(start_url, proxies={'http': prox, 'https': prox}, headers=headers_2, verify=False).json()
"""提取翻页的max_id"""
max_id = response['data']['max_id']
"""提取翻页的max_id_type"""
max_id_type = response['data']['max_id_type']
b = len(response['data']['data'])-1
print('条数',b)
"""构造GET请求参数"""
data = {
'id': weibo_id,
'mid': weibo_id,
'max_id': max_id,
'max_id_type': max_id_type
}
"""解析评论内容"""
self.parse_response_data(response, i,a)
i += 1
a += b
print('总条数',a)
"""参数传递,方法回调"""
self.parse_page_func(data, weibo_id, headers_2, i,a)
def parse_page_func(self, data, weibo_id, headers_2, i,a):
"""
:return:
"""
start_url = 'https://m.weibo.cn/comments/hotflow?'
prox = ''
response = session.get(start_url, proxies={'http': prox, 'https': prox}, headers=headers_2, params=data,
verify=False).json()
"""提取翻页的max_id"""
max_id = response['data']['max_id']
"""提取翻页的max_id_type"""
max_id_type = response['data']['max_id_type']
b = len(response['data']['data']) -1
print('条数:',b)
"""构造GET请求参数"""
data = {
'id': weibo_id,
'mid': weibo_id,
'max_id': max_id,
'max_id_type': max_id_type
}
"""解析评论内容"""
self.parse_response_data(response, i,a)
i += 1
a +=b
print('总条数',a)
"""递归回调"""
self.parse_page_func(data, weibo_id, headers_2, i,a)
def parse_response_data(self, response, i,a):
"""
从响应中提取评论内容
:return:
"""
"""提取出评论大列表"""
data_list = response['data']['data']
# print(data_list)
for data_json_dict in data_list:
# 提取评论内容
try:
texts_1 = data_json_dict['text']
"""需要sub替换掉标签内容"""
# 需要替换的内容,替换之后的内容,替换对象
alts = ''.join(re.findall(r'alt=(.*?) ', texts_1))
texts = re.sub("<span.*?</span>", alts, texts_1)
# 点赞量
like_counts = str(data_json_dict['like_count'])
# 评论时间 格林威治时间---需要转化为北京时间
created_at = data_json_dict['created_at']
std_transfer = '%a %b %d %H:%M:%S %z %Y'
std_create_times = str(datetime.strptime(created_at, std_transfer))
# 性别 提取出来的是 f
gender = data_json_dict['user']['gender']
genders = '女' if gender == 'f' else '男'
# 用户名
screen_names = data_json_dict['user']['screen_name']
# print(a,screen_names, genders, std_create_times, texts, like_counts)
data =[a,screen_names, genders, std_create_times, texts, like_counts]
print(data)
fp.writerow(data)
print()
a=a+1
except Exception as e:
continue
print('*******************************************************************************************')
print()
print(f'*****第{i}页评论打印完成*****')
if __name__ == '__main__':
w = WBSpider()
w.main(user_url, pass_wd)