如果代码失效了,可以私信我,保证及时更新。
- 2023年12月3号更新版 修复问题
- 完整代码
- 微博主体内容
- 微博评论内容
- 一级评论内容
- 二级评论内容
- 微博主体内容获取流程
- 微博评论内容获取流程
- 一级评论内容
- 二级评论内容
- 问题汇总
- csv文件乱码
2023年12月3号更新版 修复问题
参加新闻比赛,需要获取大众对某一方面的态度信息,因此选择微博作为信息收集的一部分
完整代码
微博主体内容
"""Collect the posts listed under a Weibo topic search and save them as CSV."""
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Replace with the cookies of your own logged-in session.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7k7YlpnahLGVhB90-mk0xFNznyCVsjyu9-7-Hk0jRULM.',
    'SUB': '_2A25IaC_CDeRhGeFO61AY8i_NwzyIHXVrBC0KrDV8PUNbmtAGLVLckW9NQYCXlpjzhYwtC8sDM7giaMcMNIlWSlP6',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KzhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'ALF': '1733137172',
    '_s_tentry': 'weibo.com',
    'Apache': '435019984104.0236.1701606621998',
    'ULV': '1701606622040:13:2:2:435019984104.0236.1701606621998:1701601199048',
}

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def get_the_list_response(q='话题', n='1', p='页码'):
    """Request one page of topic search results from s.weibo.com.

    Args:
        q: Topic to search for; sent as the ``q`` query parameter.
        n: Value sent as the ``nodup`` query parameter.
        p: Page number; sent as the ``page`` query parameter.

    Returns:
        The ``requests.Response`` of the search request.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'authority': 's.weibo.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'referer': 'https://s.weibo.com/weibo?q=%23%E6%96%B0%E9%97%BB%E5%AD%A6%E6%95%99%E6%8E%88%E6%80%92%E6%80%BC%E5%BC%A0%E9%9B%AA%E5%B3%B0%23&nodup=1',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
    }
    params = {
        'q': q,
        'nodup': n,
        'page': p,
    }
    return requests.get(
        'https://s.weibo.com/weibo',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )


def parse_the_list(text):
    """Parse a search-result HTML page into a DataFrame of posts.

    Args:
        text: HTML source of one search-result page.

    Returns:
        A DataFrame with the columns ``mid``, ``content``, ``star`` and
        ``time``; a field is None when its element is missing from the page.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(text, 'html.parser')
    rows = []
    for div in soup.select('div[action-type="feed_list_item"]'):
        mid = div.get('mid')

        time_links = div.select('div.card-feed > div.content > div.from > a:first-of-type')
        # get_text() also copes with a link that has nested tags, where
        # ``.string`` would be None.
        post_time = time_links[0].get_text(strip=True) if time_links else None

        paragraphs = div.select('div.card-feed > div.content > p:last-of-type')
        if paragraphs:
            # Drop zero-width spaces and join the text fragments line by line.
            content = '\n'.join(
                part.replace('\u200b', '').strip() for part in paragraphs[0].strings
            ).strip()
        else:
            content = None

        like_spans = div.select('ul > li > a > button > span.woo-like-count')
        # First text fragment of the like counter; None if the span is empty.
        star = next(like_spans[0].strings, None) if like_spans else None

        rows.append((mid, content, star, post_time))
    return pd.DataFrame(rows, columns=['mid', 'content', 'star', 'time'])


def get_the_list(q, p):
    """Fetch and parse search-result pages 1..p for topic *q*.

    A page whose request fails or does not answer with HTTP 200 is reported
    and skipped.

    Args:
        q: Topic to search for.
        p: Number of pages to request.

    Returns:
        A list with one DataFrame per successfully parsed page (may be empty).
    """
    df_list = []
    for i in range(1, p + 1):
        try:
            response = get_the_list_response(q=q, p=i)
        except requests.RequestException as exc:
            print(f'第{i}页请求失败:{exc}', flush=True)
            continue
        if response.status_code != 200:
            print(f'第{i}页请求失败,状态码:{response.status_code}', flush=True)
            continue
        df_list.append(parse_the_list(response.text))
        print(f'第{i}页解析成功!', flush=True)
    return df_list


if __name__ == '__main__':
    # Set the cookies above to your own before running.
    q = '#华为发布会#'
    p = 20
    df_list = get_the_list(q, p)
    if df_list:
        df = pd.concat(df_list)
        # utf_8_sig writes a BOM so spreadsheet tools display Chinese text correctly.
        df.to_csv(f'{q}.csv', index=False, encoding='utf_8_sig')
    else:
        # pd.concat raises on an empty list, so report instead of crashing.
        print('没有成功解析任何页面,未生成文件。')
微博评论内容
一级评论内容
"""Download the first-level comments of one Weibo post and save them as CSV."""
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Replace with the cookies of your own logged-in session.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7k7YlpnahLGVhB90-mk0xFNznyCVsjyu9-7-Hk0jRULM.',
    'SUB': '_2A25IaC_CDeRhGeFO61AY8i_NwzyIHXVrBC0KrDV8PUNbmtAGLVLckW9NQYCXlpjzhYwtC8sDM7giaMcMNIlWSlP6',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KzhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'ALF': '1733137172',
    '_s_tentry': 'weibo.com',
    'Apache': '435019984104.0236.1701606621998',
    'ULV': '1701606622040:13:2:2:435019984104.0236.1701606621998:1701601199048',
}

# Starting page number; leave unchanged. (Not referenced by the code below.)
page_num = 0

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Captured log entry that get_content_2 re-sends; only its "name" field is
# replaced with the URL of the current buildComments request.
_RUM_ENTRY_TEMPLATE = '{"name":"https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4944997453660231&is_show_bulletin=2&is_mix=0&max_id=139282732792325&count=20&uid=1762257041&fetch_level=0&locale=zh-CN","entryType":"resource","startTime":20639.80000001192,"duration":563,"initiatorType":"xmlhttprequest","nextHopProtocol":"h2","renderBlockingStatus":"non-blocking","workerStart":0,"redirectStart":0,"redirectEnd":0,"fetchStart":20639.80000001192,"domainLookupStart":20639.80000001192,"domainLookupEnd":20639.80000001192,"connectStart":20639.80000001192,"secureConnectionStart":20639.80000001192,"connectEnd":20639.80000001192,"requestStart":20641.600000023842,"responseStart":21198.600000023842,"firstInterimResponseStart":0,"responseEnd":21202.80000001192,"transferSize":7374,"encodedBodySize":7074,"decodedBodySize":42581,"responseStatus":200,"serverTiming":[],"dns":0,"tcp":0,"ttfb":557,"pathname":"https://weibo.com/ajax/statuses/buildComments","speed":0}'


def get_content_1(uid, mid, the_first=True, max_id=None):
    """Request one page of first-level comments from ``buildComments``.

    Args:
        uid: Sent as the ``uid`` query parameter.
        mid: Sent as the ``id`` query parameter.
        the_first: True for the first page; otherwise ``flow`` and ``max_id``
            are added to the query.
        max_id: Cursor returned with the previous page; used only when
            ``the_first`` is False.

    Returns:
        The ``requests.Response`` of the request.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'client-version': 'v2.43.30',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2023.09.08.4',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    params = {
        'is_reload': '1',
        'id': f'{mid}',
        'is_show_bulletin': '2',
        'is_mix': '0',
        'count': '20',
        'uid': f'{uid}',
        'fetch_level': '0',
        'locale': 'zh-CN',
    }
    if not the_first:
        params['flow'] = 0
        params['max_id'] = max_id
    return requests.get(
        'https://weibo.com/ajax/statuses/buildComments',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )


def get_content_2(get_content_1_url):
    """Post the log entry for a ``buildComments`` request to ``/ajax/log/rum``.

    Args:
        get_content_1_url: URL of the ``buildComments`` request; it is stored
            in the entry's ``name`` field.

    Returns:
        The response body as text.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'multipart/form-data; boundary=----WebKitFormBoundaryNs1Toe4Mbr8n1qXm',
        'origin': 'https://weibo.com',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    entry = json.loads(_RUM_ENTRY_TEMPLATE)
    entry['name'] = get_content_1_url
    s = json.dumps(entry)
    # Hand-built multipart body; its boundary must match the content-type header.
    data = f'------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="entry"\r\n\r\n{s}\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="request_id"\r\n\r\n\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm--\r\n'
    response = requests.post(
        'https://weibo.com/ajax/log/rum',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=REQUEST_TIMEOUT,
    )
    return response.text


def get_once_data(uid, mid, the_first=True, max_id=None):
    """Fetch one page of comments and the cursor for the next page.

    Args:
        uid: Forwarded to ``get_content_1``.
        mid: Forwarded to ``get_content_1``.
        the_first: Forwarded to ``get_content_1``.
        max_id: Forwarded to ``get_content_1``.

    Returns:
        A ``(max_id, df)`` tuple: the ``max_id`` field of the reply (0 when
        absent) and a DataFrame built from its ``data`` field (empty when
        absent).
    """
    respones_1 = get_content_1(uid, mid, the_first, max_id)
    # The log request is sent after every page; its reply is not needed.
    get_content_2(respones_1.url)
    payload = respones_1.json()  # parse the body once, not once per field
    df = pd.DataFrame(payload.get('data', []))
    max_id = payload.get('max_id', 0)
    return max_id, df


def _collect_comment_pages(uid, mid, page):
    """Fetch up to *page* comment pages and return the non-empty ones as a list."""
    df_list = []
    seen_max_ids = set()
    max_id = ''
    for i in range(page):
        # A cursor that was already requested would only repeat the same page.
        if max_id in seen_max_ids:
            break
        seen_max_ids.add(max_id)
        if i == 0:
            max_id, df = get_once_data(uid=uid, mid=mid)
        else:
            max_id, df = get_once_data(uid=uid, mid=mid, the_first=False, max_id=max_id)
        if df.shape[0] == 0:
            break
        # Keep the page before testing the cursor; otherwise the page that
        # arrives together with max_id == 0 would be thrown away.
        df_list.append(df)
        print(f'第{i}页解析完毕!max_id:{max_id}')
        if max_id == 0:
            break
    return df_list


if __name__ == '__main__':
    # Set the cookies above first, then adjust the values below.

    # User settings.
    name = '#邹振东诚邀张雪峰来厦门请你吃沙茶面#'
    uid = '2610806555'
    mid = '4914095331742409'
    page = 100

    df_list = _collect_comment_pages(uid, mid, page)
    if df_list:
        df = pd.concat(df_list).astype(str).drop_duplicates()
        # utf_8_sig writes a BOM so spreadsheet tools display Chinese text correctly.
        df.to_csv(f'{name}.csv', index=False, encoding='utf_8_sig')
    else:
        # pd.concat raises on an empty list, so report instead of crashing.
        print('没有获取到任何评论,未生成文件。')
二级评论内容
"""Download the second-level comments (replies) for saved first-level comments."""
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Not referenced by the code below.
page_num = 0

# Replace with the cookies of your own logged-in session.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KMhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'XSRF-TOKEN': '47NC7wE7TMhcqfh1K-4bacK-',
    'ALF': '1697384140',
    'SSOLoginState': '1694792141',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7IJXuI95RLbWORIsozuK4Ohxs_boeOIedEcczDT3uSAI.',
    'SUB': '_2A25IAAmdDeRhGeFO61AY8i_NwzyIHXVrdHxVrDV8PUNbmtAGLU74kW9NQYCXlmPtQ1DG4kl_wLzqQqkPl_Do1sZu',
    '_s_tentry': 'weibo.com',
    'Apache': '3760261250067.669.1694792155706',
    'ULV': '1694792155740:8:8:4:3760261250067.669.1694792155706:1694767801057',
    'WBPSESS': 'X5DJqu8gKpwqYSp80b4XokKvi4u4_oikBqVmvlBCHvGwXMxtKAFxIPg-LIF7foS715Sa4NttSYqzj5x2Ms5ynKVOM5I_Fsy9GECAYh38R4DQ-gq7M5XOe4y1gOUqvm1hOK60dUKvrA5hLuONCL2ing==',
}

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Captured log entry that get_content_2 re-sends; only its "name" field is
# replaced with the URL of the current buildComments request.
_RUM_ENTRY_TEMPLATE = '{"name":"https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4944997453660231&is_show_bulletin=2&is_mix=0&max_id=139282732792325&count=20&uid=1762257041&fetch_level=0&locale=zh-CN","entryType":"resource","startTime":20639.80000001192,"duration":563,"initiatorType":"xmlhttprequest","nextHopProtocol":"h2","renderBlockingStatus":"non-blocking","workerStart":0,"redirectStart":0,"redirectEnd":0,"fetchStart":20639.80000001192,"domainLookupStart":20639.80000001192,"domainLookupEnd":20639.80000001192,"connectStart":20639.80000001192,"secureConnectionStart":20639.80000001192,"connectEnd":20639.80000001192,"requestStart":20641.600000023842,"responseStart":21198.600000023842,"firstInterimResponseStart":0,"responseEnd":21202.80000001192,"transferSize":7374,"encodedBodySize":7074,"decodedBodySize":42581,"responseStatus":200,"serverTiming":[],"dns":0,"tcp":0,"ttfb":557,"pathname":"https://weibo.com/ajax/statuses/buildComments","speed":0}'


def get_content_1(uid, mid, the_first=True, max_id=None):
    """Request one page of second-level comments from ``buildComments``.

    Args:
        uid: Sent as the ``uid`` query parameter.
        mid: Sent as the ``id`` query parameter.
        the_first: True for the first page (``max_id`` stays ``'0'``);
            otherwise ``flow`` is added and ``max_id`` is overridden.
        max_id: Cursor returned with the previous page; used only when
            ``the_first`` is False.

    Returns:
        The ``requests.Response`` of the request.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'client-version': 'v2.43.32',
        'referer': 'https://weibo.com/1887344341/NhAosFSL4',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2023.09.14.1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': '-UX-uyKz0jmzbTnlkyDEMvSO',
    }
    params = {
        'is_reload': '1',
        'id': f'{mid}',
        'is_show_bulletin': '2',
        'is_mix': '1',
        'fetch_level': '1',
        'max_id': '0',
        'count': '20',
        'uid': f'{uid}',
        'locale': 'zh-CN',
    }
    if not the_first:
        params['flow'] = 0
        params['max_id'] = max_id
    return requests.get(
        'https://weibo.com/ajax/statuses/buildComments',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )


def get_content_2(get_content_1_url):
    """Post the log entry for a ``buildComments`` request to ``/ajax/log/rum``.

    Args:
        get_content_1_url: URL of the ``buildComments`` request; it is stored
            in the entry's ``name`` field.

    Returns:
        The response body as text.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'multipart/form-data; boundary=----WebKitFormBoundaryNs1Toe4Mbr8n1qXm',
        'origin': 'https://weibo.com',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    entry = json.loads(_RUM_ENTRY_TEMPLATE)
    entry['name'] = get_content_1_url
    s = json.dumps(entry)
    # Hand-built multipart body; its boundary must match the content-type header.
    data = f'------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="entry"\r\n\r\n{s}\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="request_id"\r\n\r\n\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm--\r\n'
    response = requests.post(
        'https://weibo.com/ajax/log/rum',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=REQUEST_TIMEOUT,
    )
    return response.text


def get_once_data(uid, mid, the_first=True, max_id=None):
    """Fetch one page of comments and the cursor for the next page.

    Args:
        uid: Forwarded to ``get_content_1``.
        mid: Forwarded to ``get_content_1``.
        the_first: Forwarded to ``get_content_1``.
        max_id: Forwarded to ``get_content_1``.

    Returns:
        A ``(max_id, df)`` tuple: the ``max_id`` field of the reply (0 when
        absent) and a DataFrame built from its ``data`` field (empty when
        absent).
    """
    respones_1 = get_content_1(uid, mid, the_first, max_id)
    # The log request is sent after every page; its reply is not needed.
    get_content_2(respones_1.url)
    payload = respones_1.json()  # parse the body once, not once per field
    df = pd.DataFrame(payload.get('data', []))
    max_id = payload.get('max_id', 0)
    return max_id, df


def _author_uid(analysis_extra):
    """Return the second ':'-separated field of *analysis_extra*, or None.

    NOTE(review): presumably the value looks like
    ``author_uid:<uid>|mid:<mid>`` - confirm against the saved CSV.
    """
    if not isinstance(analysis_extra, str):
        # An empty CSV cell is read back as NaN (a float).
        return None
    fields = analysis_extra.replace('|mid:', ':').split(':')
    return fields[1] if len(fields) > 1 else None


def _collect_reply_pages(uid, mid, page):
    """Fetch up to *page* reply pages and return the non-empty ones as a list."""
    df_list = []
    max_id_set = set()
    max_id = ''
    for j in range(page):
        # A cursor that was already requested would only repeat the same page.
        if max_id in max_id_set:
            break
        max_id_set.add(max_id)
        if j == 0:
            max_id, df_ = get_once_data(uid=uid, mid=mid)
        else:
            max_id, df_ = get_once_data(uid=uid, mid=mid, the_first=False, max_id=max_id)
        if df_.shape[0] == 0:
            break
        # Keep the page before testing the cursor; otherwise the page that
        # arrives together with max_id == 0 would be thrown away.
        df_list.append(df_)
        print(f'{mid}第{j}页解析完毕!max_id:{max_id}')
        if max_id == 0:
            break
    return df_list


if __name__ == '__main__':
    # Update the cookies above before running.

    # First-level comments saved earlier; utf_8_sig reads files with or
    # without a BOM.
    df = pd.read_csv('#邹振东诚邀张雪峰来厦门请你吃沙茶面#.csv', encoding='utf_8_sig')
    # Skip first-level comments that have no second-level comments.
    df = df[df['floor_number'] > 0]
    os.makedirs('./二级评论数据/', exist_ok=True)
    page = 100
    for analysis_extra, mid in zip(df['analysis_extra'], df['mid']):
        uid = _author_uid(analysis_extra)
        if uid is None:
            print(f'无法从 analysis_extra 解析 uid,跳过 mid:{mid}')
            continue
        out_path = f'./二级评论数据/{mid}-{uid}.csv'
        if os.path.exists(out_path):
            # Already downloaded in an earlier run.
            print(f'存在 {out_path}')
            continue
        print(f'不存在 {out_path}')
        df_list = _collect_reply_pages(uid, mid, page)
        if df_list:
            outdf = pd.concat(df_list).astype(str).drop_duplicates()
            print(f'文件长度为{outdf.shape[0]},文件保存为 {out_path}')
            # utf_8_sig writes a BOM so spreadsheet tools display Chinese text correctly.
            outdf.to_csv(out_path, index=False, encoding='utf_8_sig')
微博主体内容获取流程
以华为发布会这一热搜为例子,我们可以通过开发者模式得到信息基本都包含在下面的 div tag中
我们通过网络这一模块进行解析,发现信息基本都存储在 %23 开头的请求之中,接下来分析一下响应内容
这里可以看出响应内容为 html 格式,因此我们可以用xpath或者css来进行解析,这里我们使用BeautifulSoup来解析,解析代码如下:
# Parse one search-result page into a table with one row per post.
# `response` is the reply to the search request made beforehand.
soup = BeautifulSoup(response.text, 'lxml')
divs = soup.select('div[action-type="feed_list_item"]')
lst = []
for div in divs:
    mid = div.get('mid')

    # NOTE(review): the second replace() argument was lost when the article
    # was extracted; '?' is reconstructed because it is the value that makes
    # the following split('?')[1] return the path segment after '.com/'
    # (presumably the author's uid) - confirm against a live page.
    uid = div.select('div.card-feed > div.avator > a')
    if uid:
        uid = uid[0].get('href').replace('.com/', '?').split('?')[1]
    else:
        uid = None

    time = div.select('div.card-feed > div.content > div.from > a:first-of-type')
    if time:
        time = time[0].string.strip()
    else:
        time = None

    p = div.select('div.card-feed > div.content > p:last-of-type')
    if p:
        p = p[0].strings
        # Drop zero-width spaces and join the text fragments line by line.
        content = '\n'.join([para.replace('\u200b', '').strip() for para in list(p)]).strip()
    else:
        content = None

    star = div.select('ul > li > a > button > span.woo-like-count')
    if star:
        star = list(star[0].strings)[0]
    else:
        star = None

    lst.append((mid, uid, content, star, time))

# Last expression of the cell: displays the resulting table in a notebook.
pd.DataFrame(lst, columns=['mid', 'uid', 'content', 'star', 'time'])
我们可以获得如下结果:
这里的 mid , uid 两个参数是为了下一节获取微博评论内容需要用到的参数,这里不多解释,如果不需要删除就好,接下来我们看一下请求内容。在开始之前,为了对请求解析方便,在这里我们点击一下 查看全部搜索结果
可以发现一个以 weibo 开头的新的请求,和 %23 开头的请求内容类似,但是带了参数 q 和nodup ,再翻页之后我们可以得到 page 这一个参数
我的解析如下:
1. q:话题
2. nodup:是否展示完整内容
3. page:页码
然后可以对这个请求进行模拟,写入 python 代码中,结合之前的解析,发现内容获取 成功!
完整代码如下:
"""Collect the posts listed under a Weibo topic search and save them as CSV."""
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Replace with the cookies of your own logged-in session.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7k7YlpnahLGVhB90-mk0xFNznyCVsjyu9-7-Hk0jRULM.',
    'SUB': '_2A25IaC_CDeRhGeFO61AY8i_NwzyIHXVrBC0KrDV8PUNbmtAGLVLckW9NQYCXlpjzhYwtC8sDM7giaMcMNIlWSlP6',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KzhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'ALF': '1733137172',
    '_s_tentry': 'weibo.com',
    'Apache': '435019984104.0236.1701606621998',
    'ULV': '1701606622040:13:2:2:435019984104.0236.1701606621998:1701601199048',
}

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def get_the_list_response(q='话题', n='1', p='页码'):
    """Request one page of topic search results from s.weibo.com.

    Args:
        q: Topic to search for; sent as the ``q`` query parameter.
        n: Value sent as the ``nodup`` query parameter.
        p: Page number; sent as the ``page`` query parameter.

    Returns:
        The ``requests.Response`` of the search request.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'authority': 's.weibo.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        # This value was truncated in this copy of the script; restored from
        # the identical listing earlier in the article.
        'referer': 'https://s.weibo.com/weibo?q=%23%E6%96%B0%E9%97%BB%E5%AD%A6%E6%95%99%E6%8E%88%E6%80%92%E6%80%BC%E5%BC%A0%E9%9B%AA%E5%B3%B0%23&nodup=1',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
    }
    params = {
        'q': q,
        'nodup': n,
        'page': p,
    }
    return requests.get(
        'https://s.weibo.com/weibo',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )


def parse_the_list(text):
    """Parse a search-result HTML page into a DataFrame of posts.

    Args:
        text: HTML source of one search-result page.

    Returns:
        A DataFrame with the columns ``mid``, ``content``, ``star`` and
        ``time``; a field is None when its element is missing from the page.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(text, 'html.parser')
    rows = []
    for div in soup.select('div[action-type="feed_list_item"]'):
        mid = div.get('mid')

        time_links = div.select('div.card-feed > div.content > div.from > a:first-of-type')
        # get_text() also copes with a link that has nested tags, where
        # ``.string`` would be None.
        post_time = time_links[0].get_text(strip=True) if time_links else None

        paragraphs = div.select('div.card-feed > div.content > p:last-of-type')
        if paragraphs:
            # Drop zero-width spaces and join the text fragments line by line.
            content = '\n'.join(
                part.replace('\u200b', '').strip() for part in paragraphs[0].strings
            ).strip()
        else:
            content = None

        like_spans = div.select('ul > li > a > button > span.woo-like-count')
        # First text fragment of the like counter; None if the span is empty.
        star = next(like_spans[0].strings, None) if like_spans else None

        rows.append((mid, content, star, post_time))
    return pd.DataFrame(rows, columns=['mid', 'content', 'star', 'time'])


def get_the_list(q, p):
    """Fetch and parse search-result pages 1..p for topic *q*.

    A page whose request fails or does not answer with HTTP 200 is reported
    and skipped.

    Args:
        q: Topic to search for.
        p: Number of pages to request.

    Returns:
        A list with one DataFrame per successfully parsed page (may be empty).
    """
    df_list = []
    for i in range(1, p + 1):
        try:
            response = get_the_list_response(q=q, p=i)
        except requests.RequestException as exc:
            print(f'第{i}页请求失败:{exc}', flush=True)
            continue
        if response.status_code != 200:
            print(f'第{i}页请求失败,状态码:{response.status_code}', flush=True)
            continue
        df_list.append(parse_the_list(response.text))
        print(f'第{i}页解析成功!', flush=True)
    return df_list


if __name__ == '__main__':
    # Set the cookies above to your own before running.
    q = '#华为发布会#'
    p = 20
    df_list = get_the_list(q, p)
    if df_list:
        df = pd.concat(df_list)
        # utf_8_sig writes a BOM so spreadsheet tools display Chinese text correctly.
        df.to_csv(f'{q}.csv', index=False, encoding='utf_8_sig')
    else:
        # pd.concat raises on an empty list, so report instead of crashing.
        print('没有成功解析任何页面,未生成文件。')
微博评论内容获取流程
一级评论内容
上一节获取了微博主体内容,可以发现并没有什么难点。本来我以为都结束了,队长偏要评论内容,无奈我只好继续解析评论内容。接下来我们来获取微博评论内容,有一点点绕。
首先我们点开评论数较多的微博, 然后点击 后面还有552条评论,点击查看
看到
和上一节一样来查找请求,发现 buildComments 开头的请求。
话不多说,往下滑动,多获得几个请求,对得到的请求,分析如下:
# Each scroll of the comment list triggers two requests: one to
# `buildComments` and one to `/ajax/log/rum`, whose form data embeds the URL
# of that buildComments request. The script below replays both for every page.
"""Download the first-level comments of one Weibo post and save them as CSV."""
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Replace with the cookies of your own logged-in session.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7k7YlpnahLGVhB90-mk0xFNznyCVsjyu9-7-Hk0jRULM.',
    'SUB': '_2A25IaC_CDeRhGeFO61AY8i_NwzyIHXVrBC0KrDV8PUNbmtAGLVLckW9NQYCXlpjzhYwtC8sDM7giaMcMNIlWSlP6',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KzhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'ALF': '1733137172',
    '_s_tentry': 'weibo.com',
    'Apache': '435019984104.0236.1701606621998',
    'ULV': '1701606622040:13:2:2:435019984104.0236.1701606621998:1701601199048',
}

# Starting page number; leave unchanged. (Not referenced by the code below.)
page_num = 0

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Captured log entry that get_content_2 re-sends; only its "name" field is
# replaced with the URL of the current buildComments request.
_RUM_ENTRY_TEMPLATE = '{"name":"https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4944997453660231&is_show_bulletin=2&is_mix=0&max_id=139282732792325&count=20&uid=1762257041&fetch_level=0&locale=zh-CN","entryType":"resource","startTime":20639.80000001192,"duration":563,"initiatorType":"xmlhttprequest","nextHopProtocol":"h2","renderBlockingStatus":"non-blocking","workerStart":0,"redirectStart":0,"redirectEnd":0,"fetchStart":20639.80000001192,"domainLookupStart":20639.80000001192,"domainLookupEnd":20639.80000001192,"connectStart":20639.80000001192,"secureConnectionStart":20639.80000001192,"connectEnd":20639.80000001192,"requestStart":20641.600000023842,"responseStart":21198.600000023842,"firstInterimResponseStart":0,"responseEnd":21202.80000001192,"transferSize":7374,"encodedBodySize":7074,"decodedBodySize":42581,"responseStatus":200,"serverTiming":[],"dns":0,"tcp":0,"ttfb":557,"pathname":"https://weibo.com/ajax/statuses/buildComments","speed":0}'


def get_content_1(uid, mid, the_first=True, max_id=None):
    """Request one page of first-level comments from ``buildComments``.

    Args:
        uid: Sent as the ``uid`` query parameter.
        mid: Sent as the ``id`` query parameter.
        the_first: True for the first page; otherwise ``flow`` and ``max_id``
            are added to the query.
        max_id: Cursor returned with the previous page; used only when
            ``the_first`` is False.

    Returns:
        The ``requests.Response`` of the request.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'client-version': 'v2.43.30',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2023.09.08.4',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    params = {
        'is_reload': '1',
        'id': f'{mid}',
        'is_show_bulletin': '2',
        'is_mix': '0',
        'count': '20',
        'uid': f'{uid}',
        'fetch_level': '0',
        'locale': 'zh-CN',
    }
    if not the_first:
        params['flow'] = 0
        params['max_id'] = max_id
    return requests.get(
        'https://weibo.com/ajax/statuses/buildComments',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )


def get_content_2(get_content_1_url):
    """Post the log entry for a ``buildComments`` request to ``/ajax/log/rum``.

    Args:
        get_content_1_url: URL of the ``buildComments`` request; it is stored
            in the entry's ``name`` field.

    Returns:
        The response body as text.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'multipart/form-data; boundary=----WebKitFormBoundaryNs1Toe4Mbr8n1qXm',
        'origin': 'https://weibo.com',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    entry = json.loads(_RUM_ENTRY_TEMPLATE)
    entry['name'] = get_content_1_url
    s = json.dumps(entry)
    # Hand-built multipart body; its boundary must match the content-type header.
    data = f'------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="entry"\r\n\r\n{s}\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="request_id"\r\n\r\n\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm--\r\n'
    response = requests.post(
        'https://weibo.com/ajax/log/rum',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=REQUEST_TIMEOUT,
    )
    return response.text


def get_once_data(uid, mid, the_first=True, max_id=None):
    """Fetch one page of comments and the cursor for the next page.

    Args:
        uid: Forwarded to ``get_content_1``.
        mid: Forwarded to ``get_content_1``.
        the_first: Forwarded to ``get_content_1``.
        max_id: Forwarded to ``get_content_1``.

    Returns:
        A ``(max_id, df)`` tuple: the ``max_id`` field of the reply (0 when
        absent) and a DataFrame built from its ``data`` field (empty when
        absent).
    """
    respones_1 = get_content_1(uid, mid, the_first, max_id)
    # The log request is sent after every page; its reply is not needed.
    get_content_2(respones_1.url)
    payload = respones_1.json()  # parse the body once, not once per field
    df = pd.DataFrame(payload.get('data', []))
    max_id = payload.get('max_id', 0)
    return max_id, df


def _collect_comment_pages(uid, mid, page):
    """Fetch up to *page* comment pages and return the non-empty ones as a list."""
    df_list = []
    seen_max_ids = set()
    max_id = ''
    for i in range(page):
        # A cursor that was already requested would only repeat the same page.
        if max_id in seen_max_ids:
            break
        seen_max_ids.add(max_id)
        if i == 0:
            max_id, df = get_once_data(uid=uid, mid=mid)
        else:
            max_id, df = get_once_data(uid=uid, mid=mid, the_first=False, max_id=max_id)
        if df.shape[0] == 0:
            break
        # Keep the page before testing the cursor; otherwise the page that
        # arrives together with max_id == 0 would be thrown away.
        df_list.append(df)
        print(f'第{i}页解析完毕!max_id:{max_id}')
        if max_id == 0:
            break
    return df_list


if __name__ == '__main__':
    # Set the cookies above first, then adjust the values below.

    # User settings.
    name = '#邹振东诚邀张雪峰来厦门请你吃沙茶面#'
    uid = '2610806555'
    mid = '4914095331742409'
    page = 100

    df_list = _collect_comment_pages(uid, mid, page)
    if df_list:
        df = pd.concat(df_list).astype(str).drop_duplicates()
        # utf_8_sig writes a BOM so spreadsheet tools display Chinese text correctly.
        df.to_csv(f'{name}.csv', index=False, encoding='utf_8_sig')
    else:
        # pd.concat raises on an empty list, so report instead of crashing.
        print('没有获取到任何评论,未生成文件。')
结束!
二级评论内容
二级评论的流程和一级评论一样,不同的是参数
一级评论的参数
# Query parameters of a first-level comment request: is_mix and fetch_level
# are both '0', and no max_id is sent.
params = {
    'is_reload': '1',
    'id': f'{mid}',
    'is_show_bulletin': '2',
    'is_mix': '0',
    'count': '20',
    'uid': f'{uid}',
    'fetch_level': '0',
    'locale': 'zh-CN',
}
二级评论的参数
# Query parameters of a second-level comment request: is_mix and fetch_level
# are both '1', and max_id starts at '0'.
params = {
    'is_reload': '1',
    'id': f'{mid}',
    'is_show_bulletin': '2',
    'is_mix': '1',
    'fetch_level': '1',
    'max_id': '0',
    'count': '20',
    'uid': f'{uid}',
    'locale': 'zh-CN',
}
二级评论参数的uid指的是微博主体内容的作者uid,而mid指的是评论者的mid
完整代码如下:
"""Download the second-level comments (replies) for saved first-level comments."""
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Not referenced by the code below.
page_num = 0

# Replace with the cookies of your own logged-in session.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KMhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'XSRF-TOKEN': '47NC7wE7TMhcqfh1K-4bacK-',
    'ALF': '1697384140',
    'SSOLoginState': '1694792141',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7IJXuI95RLbWORIsozuK4Ohxs_boeOIedEcczDT3uSAI.',
    'SUB': '_2A25IAAmdDeRhGeFO61AY8i_NwzyIHXVrdHxVrDV8PUNbmtAGLU74kW9NQYCXlmPtQ1DG4kl_wLzqQqkPl_Do1sZu',
    '_s_tentry': 'weibo.com',
    'Apache': '3760261250067.669.1694792155706',
    'ULV': '1694792155740:8:8:4:3760261250067.669.1694792155706:1694767801057',
    'WBPSESS': 'X5DJqu8gKpwqYSp80b4XokKvi4u4_oikBqVmvlBCHvGwXMxtKAFxIPg-LIF7foS715Sa4NttSYqzj5x2Ms5ynKVOM5I_Fsy9GECAYh38R4DQ-gq7M5XOe4y1gOUqvm1hOK60dUKvrA5hLuONCL2ing==',
}

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Captured log entry that get_content_2 re-sends; only its "name" field is
# replaced with the URL of the current buildComments request.
_RUM_ENTRY_TEMPLATE = '{"name":"https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4944997453660231&is_show_bulletin=2&is_mix=0&max_id=139282732792325&count=20&uid=1762257041&fetch_level=0&locale=zh-CN","entryType":"resource","startTime":20639.80000001192,"duration":563,"initiatorType":"xmlhttprequest","nextHopProtocol":"h2","renderBlockingStatus":"non-blocking","workerStart":0,"redirectStart":0,"redirectEnd":0,"fetchStart":20639.80000001192,"domainLookupStart":20639.80000001192,"domainLookupEnd":20639.80000001192,"connectStart":20639.80000001192,"secureConnectionStart":20639.80000001192,"connectEnd":20639.80000001192,"requestStart":20641.600000023842,"responseStart":21198.600000023842,"firstInterimResponseStart":0,"responseEnd":21202.80000001192,"transferSize":7374,"encodedBodySize":7074,"decodedBodySize":42581,"responseStatus":200,"serverTiming":[],"dns":0,"tcp":0,"ttfb":557,"pathname":"https://weibo.com/ajax/statuses/buildComments","speed":0}'


def get_content_1(uid, mid, the_first=True, max_id=None):
    """Request one page of second-level comments from ``buildComments``.

    Args:
        uid: Sent as the ``uid`` query parameter.
        mid: Sent as the ``id`` query parameter.
        the_first: True for the first page (``max_id`` stays ``'0'``);
            otherwise ``flow`` is added and ``max_id`` is overridden.
        max_id: Cursor returned with the previous page; used only when
            ``the_first`` is False.

    Returns:
        The ``requests.Response`` of the request.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'client-version': 'v2.43.32',
        'referer': 'https://weibo.com/1887344341/NhAosFSL4',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        # This value was truncated in this copy of the script; restored from
        # the identical listing earlier in the article.
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2023.09.14.1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': '-UX-uyKz0jmzbTnlkyDEMvSO',
    }
    params = {
        'is_reload': '1',
        'id': f'{mid}',
        'is_show_bulletin': '2',
        'is_mix': '1',
        'fetch_level': '1',
        'max_id': '0',
        'count': '20',
        'uid': f'{uid}',
        'locale': 'zh-CN',
    }
    if not the_first:
        params['flow'] = 0
        params['max_id'] = max_id
    return requests.get(
        'https://weibo.com/ajax/statuses/buildComments',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )


def get_content_2(get_content_1_url):
    """Post the log entry for a ``buildComments`` request to ``/ajax/log/rum``.

    Args:
        get_content_1_url: URL of the ``buildComments`` request; it is stored
            in the entry's ``name`` field.

    Returns:
        The response body as text.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'multipart/form-data; boundary=----WebKitFormBoundaryNs1Toe4Mbr8n1qXm',
        'origin': 'https://weibo.com',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    entry = json.loads(_RUM_ENTRY_TEMPLATE)
    entry['name'] = get_content_1_url
    s = json.dumps(entry)
    # Hand-built multipart body; its boundary must match the content-type header.
    data = f'------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="entry"\r\n\r\n{s}\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="request_id"\r\n\r\n\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm--\r\n'
    response = requests.post(
        'https://weibo.com/ajax/log/rum',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=REQUEST_TIMEOUT,
    )
    return response.text


def get_once_data(uid, mid, the_first=True, max_id=None):
    """Fetch one page of comments and the cursor for the next page.

    Args:
        uid: Forwarded to ``get_content_1``.
        mid: Forwarded to ``get_content_1``.
        the_first: Forwarded to ``get_content_1``.
        max_id: Forwarded to ``get_content_1``.

    Returns:
        A ``(max_id, df)`` tuple: the ``max_id`` field of the reply (0 when
        absent) and a DataFrame built from its ``data`` field (empty when
        absent).
    """
    respones_1 = get_content_1(uid, mid, the_first, max_id)
    # The log request is sent after every page; its reply is not needed.
    get_content_2(respones_1.url)
    payload = respones_1.json()  # parse the body once, not once per field
    df = pd.DataFrame(payload.get('data', []))
    max_id = payload.get('max_id', 0)
    return max_id, df


def _author_uid(analysis_extra):
    """Return the second ':'-separated field of *analysis_extra*, or None.

    NOTE(review): presumably the value looks like
    ``author_uid:<uid>|mid:<mid>`` - confirm against the saved CSV.
    """
    if not isinstance(analysis_extra, str):
        # An empty CSV cell is read back as NaN (a float).
        return None
    fields = analysis_extra.replace('|mid:', ':').split(':')
    return fields[1] if len(fields) > 1 else None


def _collect_reply_pages(uid, mid, page):
    """Fetch up to *page* reply pages and return the non-empty ones as a list."""
    df_list = []
    max_id_set = set()
    max_id = ''
    for j in range(page):
        # A cursor that was already requested would only repeat the same page.
        if max_id in max_id_set:
            break
        max_id_set.add(max_id)
        if j == 0:
            max_id, df_ = get_once_data(uid=uid, mid=mid)
        else:
            max_id, df_ = get_once_data(uid=uid, mid=mid, the_first=False, max_id=max_id)
        if df_.shape[0] == 0:
            break
        # Keep the page before testing the cursor; otherwise the page that
        # arrives together with max_id == 0 would be thrown away.
        df_list.append(df_)
        print(f'{mid}第{j}页解析完毕!max_id:{max_id}')
        if max_id == 0:
            break
    return df_list


if __name__ == '__main__':
    # Update the cookies above before running.

    # First-level comments saved earlier; utf_8_sig reads files with or
    # without a BOM.
    df = pd.read_csv('#邹振东诚邀张雪峰来厦门请你吃沙茶面#.csv', encoding='utf_8_sig')
    # Skip first-level comments that have no second-level comments.
    df = df[df['floor_number'] > 0]
    os.makedirs('./二级评论数据/', exist_ok=True)
    page = 100
    for analysis_extra, mid in zip(df['analysis_extra'], df['mid']):
        uid = _author_uid(analysis_extra)
        if uid is None:
            print(f'无法从 analysis_extra 解析 uid,跳过 mid:{mid}')
            continue
        out_path = f'./二级评论数据/{mid}-{uid}.csv'
        if os.path.exists(out_path):
            # Already downloaded in an earlier run.
            print(f'存在 {out_path}')
            continue
        print(f'不存在 {out_path}')
        df_list = _collect_reply_pages(uid, mid, page)
        if df_list:
            outdf = pd.concat(df_list).astype(str).drop_duplicates()
            print(f'文件长度为{outdf.shape[0]},文件保存为 {out_path}')
            # utf_8_sig writes a BOM so spreadsheet tools display Chinese text correctly.
            outdf.to_csv(out_path, index=False, encoding='utf_8_sig')
代码运行结果
完成!
问题汇总
csv文件乱码
把 df.to_csv(...)
改为 df.to_csv(..., encoding='utf_8_sig')