Bläddra i källkod

完成了内容,时间,mid的爬取,按照输入爬取具体某天的信息,按照小时进行任务分配与保存,接下来添加评论数和点赞数的功能

Shellmiao 4 år sedan
förälder
incheckning
4ae76b08ec

+ 0 - 2
get_weibo_content/get_content.py

@@ -21,7 +21,5 @@ def get_content(html, keyword):
         if keyword in temp:
             if_contains_keyword = True
         weibo_content_str.append(temp)
-        print(temp)
         count += 1
-    print(len(weibo_content_str))
     return weibo_content_str, flag, if_contains_keyword

+ 1 - 3
get_weibo_content/get_mid.py

@@ -9,6 +9,4 @@ def get_mid(html):
         temp = i.get("mid")
         if temp:
             weibo_mid_str.append(temp)
-            print(temp)
-    print(len(weibo_mid_str))
-    return weibo_mid
+    return weibo_mid_str

+ 0 - 2
get_weibo_content/get_time.py

@@ -19,8 +19,6 @@ def get_time(html, flag):
         temp = reprocess_str(temp)
         weibo_time_str.append(temp)
         count += 1
-        print(temp)
-    print(len(weibo_time_str))
     return weibo_time_str
 
 

+ 13 - 3
main.py

@@ -2,23 +2,33 @@ from get_weibo_content.get_one_page import get_one_page
 from get_weibo_content.get_content import get_content
 from get_weibo_content.get_mid import get_mid
 from get_weibo_content.get_time import get_time
+from save_data.save_data_to_excel import get_one_page_excel, save_to_excel
 import datetime
 import time
+import random
 
 
 def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
     begin_num = 1
     page_count = begin_num
+    all_data = []  # rows accumulated from every page; written out once, after the loop
     while True:
+        print('[-]' + date_begin_temp + '-' + date_end_temp + '-page_' + str(page_count) + ':开始爬取...')
         html = get_one_page(keyword_temp, page_count, date_begin_temp, date_end_temp, proxy_temp, cookie_temp,
                             user_agent_temp)
         weibo_content_str, flag, if_contains_keyword = get_content(html, keyword)
         if not if_contains_keyword:
             break
-        get_mid(html)
-        get_time(html, flag)
-        time.sleep(10)
+        weibo_mid_str = get_mid(html)  # NOTE(review): get_content() above receives `keyword`, which is not a parameter or local of this function (keyword_temp is) - confirm a module-level `keyword` is intended
+        weibo_time_str = get_time(html, flag)
+        length = len(weibo_content_str)  # number of content entries returned for this page
+        print('[-]' + date_begin_temp + '-' + date_end_temp + '-page_' + str(page_count) + ':爬取到' + str(length) + '条信息')
+        one_page_data = get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, length)
+        all_data += one_page_data
+        time.sleep(random.randint(3, 6))  # pause a random 3-6 seconds (inclusive) before requesting the next page
         page_count += 1
+    print('[-]' + date_begin_temp + '-' + date_end_temp + '-page_' + str(page_count) + ':爬取完毕')  # NOTE(review): page_count here is the page that triggered the break, not the last page whose rows were kept
+    save_to_excel(all_data, keyword_temp, date_begin_temp, date_end_temp)  # only reached after the loop ends, so an exception raised inside the loop skips the save
 
 
 if __name__ == '__main__':

+ 0 - 0
save_data/__init__.py


+ 15 - 0
save_data/save_data_to_excel.py

@@ -0,0 +1,15 @@
+import pandas as pd
+
+
+def get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, length):
+    """Return one dict (文章ID, 发文时间, 文章内容) per post, for at most `length` posts."""
+    column_name = ('文章ID', '发文时间', '文章内容')
+    # zip() stops at the shortest input, so a short mid/time list no longer raises IndexError
+    # (rows are still paired by position, not realigned); range() keeps the `length` cap.
+    rows = zip(range(length), weibo_mid_str, weibo_time_str, weibo_content_str)
+    return [dict(zip(column_name, row[1:])) for row in rows]
+
+
+def save_to_excel(all_data, keyword, date_begin, date_end):
+    """Write all_data to '微博爬取内容-<keyword>-<date_begin>-<date_end>.xlsx' without the index column."""
+    pd.DataFrame(all_data).to_excel('微博爬取内容-' + '-'.join((keyword, date_begin, date_end)) + '.xlsx', index=False)

BIN
微博爬取内容-吴亦凡-2021-07-23-0-2021-07-23-1.xlsx


BIN
微博爬取内容-吴亦凡-2021-07-23-1-2021-07-23-2.xlsx