4 年之前 · 794fcdd009
--- a/get_weibo_content/get_comment.py
+++ b/get_weibo_content/get_comment.py
@@ -0,0 +1,24 @@
 
				+from lxml import etree
			
 
				+from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
			
 
				+
			
 
				+
			
 
				+def get_comment_count(html):
			
 
				+    selector = etree.HTML(html)
			
 
				+    weibo_comment_count_temp = selector.xpath('//div[@class="card-act"]/ul/li[3]/a')
			
 
				+    weibo_comment_count = []
			
 
				+    for i in weibo_comment_count_temp:
			
 
				+        temp = remove_html_tags(etree.tostring(i))
			
 
				+        temp = html_unicode_2_chinese(temp)
			
 
				+        temp = process_str(temp)
			
 
				+        temp = reprocess_str(temp)
			
 
				+        if temp:
			
 
				+            weibo_comment_count.append(temp)
			
 
				+        else:
			
 
				+            weibo_comment_count.append('0')
			
 
				+    return weibo_comment_count
			
 
				+
			
 
				+
			
 
				+def reprocess_str(unprocessed_str):
			
 
				+    temp = unprocessed_str.replace(' ', '')
			
 
				+    temp = temp.replace('评论', '')
			
 
				+    return temp
			
--- a/get_weibo_content/get_like.py
+++ b/get_weibo_content/get_like.py
@@ -0,0 +1,15 @@
 
				+from lxml import etree
			
 
				+from get_weibo_content.process_data import remove_html_tags
			
 
				+
			
 
				+
			
 
				+def get_like_count(html):
			
 
				+    selector = etree.HTML(html)
			
 
				+    weibo_like_count_temp = selector.xpath('//div[@class="card-act"]/ul/li[4]/a/em')
			
 
				+    weibo_like_count = []
			
 
				+    for i in weibo_like_count_temp:
			
 
				+        temp = remove_html_tags(etree.tostring(i))
			
 
				+        if temp:
			
 
				+            weibo_like_count.append(temp)
			
 
				+        else:
			
 
				+            weibo_like_count.append('0')
			
 
				+    return weibo_like_count
			
--- a/get_weibo_content/get_time.py
+++ b/get_weibo_content/get_time.py
@@ -1,8 +1,9 @@
 
				 from lxml import etree
			
 
				+import time
			
 
				 from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
			
 
				 
			
 
				 
			
 
				-def get_time(html, flag):
			
 
				+def get_time(html, flag, date_begin):
			
 
				     selector = etree.HTML(html)
			
 
				     weibo_time = selector.xpath('//div[@class="content"]/p[2]/a[1]')
			
 
				     weibo_time_str = []
			
@@ -17,6 +18,11 @@ def get_time(html, flag):
 
				         temp = html_unicode_2_chinese(temp)
			
 
				         temp = process_str(temp)
			
 
				         temp = reprocess_str(temp)
			
 
				+        if '今天' in temp:
			
 
				+            temp = temp.replace('今天', '-')
			
 
				+            temp = time.strftime("%Y-%m-%d", time.localtime()) + temp
			
 
				+        else:
			
 
				+            temp = date_begin[0:4] + '-' + temp
			
 
				         weibo_time_str.append(temp)
			
 
				         count += 1
			
 
				     return weibo_time_str
			
@@ -25,4 +31,7 @@ def get_time(html, flag):
 
				 def reprocess_str(unprocessed_str):
			
 
				     temp = unprocessed_str.replace(' ', '')
			
 
				     temp = temp.replace('来自', '')
			
 
				+    temp = temp.replace('\xa0', '')
			
 
				+    temp = temp.replace('月', '-')
			
 
				+    temp = temp.replace('日', '-')
			
 
				     return temp
			
--- a/get_weibo_content/process_data.py
+++ b/get_weibo_content/process_data.py
@@ -4,7 +4,7 @@ from html import unescape
 
				 
			
 
				 # input:str
			
 
				 def remove_html_tags(html_str):
			
 
				-    temp = re.sub(r'<.*?>', '', str(html_str))
			
 
				+    temp = re.sub(r'<.*?>', '', html_str.decode("utf-8"))
			
 
				     processed_str = temp.replace('\n', '')
			
 
				     return processed_str
			
 
				 
			
--- a/main.py
+++ b/main.py
@@ -2,6 +2,8 @@ from get_weibo_content.get_one_page import get_one_page
 
				 from get_weibo_content.get_content import get_content
			
 
				 from get_weibo_content.get_mid import get_mid
			
 
				 from get_weibo_content.get_time import get_time
			
 
				+from get_weibo_content.get_comment import get_comment_count
			
 
				+from get_weibo_content.get_like import get_like_count
			
 
				 from save_data.save_data_to_excel import get_one_page_excel, save_to_excel
			
 
				 import datetime
			
 
				 import time
			
@@ -13,21 +15,26 @@ def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy
 
				     page_count = begin_num
			
 
				     all_data = []
			
 
				     while True:
			
 
				-        print('[-]' + date_begin_temp + '-' + date_end_temp + '-page_' + str(page_count) + ':开始爬取...')
			
 
				+        print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':开始爬取...')
			
 
				         html = get_one_page(keyword_temp, page_count, date_begin_temp, date_end_temp, proxy_temp, cookie_temp,
			
 
				                             user_agent_temp)
			
 
				         weibo_content_str, flag, if_contains_keyword = get_content(html, keyword)
			
 
				         if not if_contains_keyword:
			
 
				             break
			
 
				         weibo_mid_str = get_mid(html)
			
 
				-        weibo_time_str = get_time(html, flag)
			
 
				+        weibo_comment_count = get_comment_count(html)
			
 
				+        weibo_like_count = get_like_count(html)
			
 
				+        weibo_time_str = get_time(html, flag, date_begin_temp)
			
 
				+
			
 
				         length = len(weibo_content_str)
			
 
				-        print('[-]' + date_begin_temp + '-' + date_end_temp + '-page_' + str(page_count) + ':爬取到' + str(length) + '条信息')
			
 
				-        one_page_data = get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, length)
			
 
				+        print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':爬取到' + str(
			
 
				+            length) + '条信息')
			
 
				+        one_page_data = get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, weibo_like_count,
			
 
				+                                           weibo_comment_count, length)
			
 
				         all_data += one_page_data
			
 
				         time.sleep(random.randint(3, 6))
			
 
				         page_count += 1
			
 
				-    print('[-]' + date_begin_temp + '-' + date_end_temp + '-page_' + str(page_count) + ':爬取完毕')
			
 
				+    print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':爬取完毕')
			
 
				     save_to_excel(all_data, keyword_temp, date_begin_temp, date_end_temp)
			
 
				 
			
 
				 
			
@@ -36,7 +43,10 @@ if __name__ == '__main__':
 
				     date_str = input('[-]请输入需要查询的当天日期(格式：2021-07-01):')
			
 
				     date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
			
 
				     # cookie = input('[-]请输入cookie:')
			
 
				-    cookie = 'SUB=_2A25N_5x1DeRhGeBO4lsY9y_Pyz-IHXVvAyQ9rDV8PUJbkNAfLWH8kW1NRYEkPnoC7fl3RMKtu4E9iyGtx5ldeVcn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; SINAGLOBAL=2352245042816.5166.1627033753029; ULV=1627033753032:1:1:1:2352245042816.5166.1627033753029:; UOR=,,graph.qq.com; login_sid_t=5ed1466aa2327aac6d83a6652aa1a60a; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=2352245042816.5166.1627033753029; WBtopGlobal_register_version=2021072412; webim_unReadCount=%7B%22time%22%3A1627119847912%2C%22dm_pub_total%22%3A1%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A36%2C%22msgbox%22%3A0%7D; appkey=; SSOLoginState=1627122726; wvr=6; WBStorage=2ceabba76d81138d|undefined'
			
 
				+    # 测试用cookie
			
 
				+    # cookie = 'SUB=_2A25N_5x1DeRhGeBO4lsY9y_Pyz-IHXVvAyQ9rDV8PUJbkNAfLWH8kW1NRYEkPnoC7fl3RMKtu4E9iyGtx5ldeVcn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; SINAGLOBAL=2352245042816.5166.1627033753029; ULV=1627033753032:1:1:1:2352245042816.5166.1627033753029:; UOR=,,graph.qq.com; login_sid_t=5ed1466aa2327aac6d83a6652aa1a60a; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=2352245042816.5166.1627033753029; WBtopGlobal_register_version=2021072412; webim_unReadCount=%7B%22time%22%3A1627119847912%2C%22dm_pub_total%22%3A1%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A36%2C%22msgbox%22%3A0%7D; appkey=; SSOLoginState=1627122726; wvr=6; WBStorage=2ceabba76d81138d|undefined'
			
 
				+    # 写上cookie
			
 
				+    cookie = ''
			
 
				     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
			
 
				     proxy = '127.0.0.1:80'
			
 
				     for i in range(0, 24):
			
@@ -46,4 +56,6 @@ if __name__ == '__main__':
 
				             date_end = date_temp_str + '-' + str((i + 1) % 24)
			
 
				         else:
			
 
				             date_end = date_str + '-' + str((i + 1) % 24)
			
 
				+        if not cookie:
			
 
				+            print('请在程序中填写cookie!')
			
 
				         run_from_time_a_2_time_b(keyword, date_begin, date_end, proxy, cookie, user_agent)
			
--- a/save_data/save_data_to_excel.py
+++ b/save_data/save_data_to_excel.py
@@ -1,11 +1,12 @@
 
				 import pandas as pd
			
 
				 
			
 
				 
			
 
				-def get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, length):
			
 
				+def get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, weibo_like_count, weibo_comment_count, length):
			
 
				     one_page_data = []
			
 
				     for i in range(0, length):
			
 
				-        one_piece_data = (weibo_mid_str[i], weibo_time_str[i], weibo_content_str[i])
			
 
				-        column_name = ('文章ID', '发文时间', '文章内容')
			
 
				+        one_piece_data = (
			
 
				+            weibo_mid_str[i], weibo_time_str[i], weibo_content_str[i], weibo_like_count[i], weibo_comment_count[i])
			
 
				+        column_name = ('文章ID', '发文时间', '文章内容', '点赞数', '评论数')
			
 
				         one_page_data.append(dict(zip(column_name, one_piece_data)))
			
 
				     return one_page_data
			
 
				 
			
--- a/微博爬取内容-吴亦凡-2021-07-15.zip
+++ b/微博爬取内容-吴亦凡-2021-07-15.zip
--- a/微博爬取内容-吴亦凡-2021-07-23-0-2021-07-23-1.xlsx
+++ b/微博爬取内容-吴亦凡-2021-07-23-0-2021-07-23-1.xlsx
--- a/微博爬取内容-吴亦凡-2021-07-23-1-2021-07-23-2.xlsx
+++ b/微博爬取内容-吴亦凡-2021-07-23-1-2021-07-23-2.xlsx