|
|
@@ -2,23 +2,33 @@ from get_weibo_content.get_one_page import get_one_page
|
|
|
from get_weibo_content.get_content import get_content
|
|
|
from get_weibo_content.get_mid import get_mid
|
|
|
from get_weibo_content.get_time import get_time
|
|
|
+from save_data.save_data_to_excel import get_one_page_excel, save_to_excel
|
|
|
import datetime
|
|
|
import time
|
|
|
+import random
|
|
|
|
|
|
|
|
|
def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
    """Crawl every result page for a keyword in one time window and save the rows.

    Pages are fetched one after another, starting at page 1, until
    ``get_content`` reports that a page no longer contains the keyword.
    The rows built from each page by ``get_one_page_excel`` are accumulated
    and handed to ``save_to_excel`` once the loop ends.

    Args:
        keyword_temp: Search keyword; forwarded to ``get_one_page``,
            ``get_content`` and ``save_to_excel``.
        date_begin_temp: Start of the time window. Must be a ``str`` because
            it is concatenated into the progress messages.
        date_end_temp: End of the time window. Must be a ``str`` for the
            same reason.
        proxy_temp: Proxy setting forwarded to ``get_one_page``.
        cookie_temp: Cookie forwarded to ``get_one_page``.
        user_agent_temp: User-Agent forwarded to ``get_one_page``.

    Returns:
        None. The collected data is passed to ``save_to_excel``.
    """
    begin_num = 1
    page_count = begin_num
    all_data = []
    # Loop-invariant part of every progress message.
    log_prefix = '[-]' + date_begin_temp + '-' + date_end_temp + '-page_'
    while True:
        print(log_prefix + str(page_count) + ':开始爬取...')
        html = get_one_page(keyword_temp, page_count, date_begin_temp, date_end_temp, proxy_temp, cookie_temp,
                            user_agent_temp)
        # Pass the function's own keyword argument. The previous code read a
        # name ``keyword`` that is not defined in this function, so it only
        # worked when a module-level global of that name happened to exist.
        weibo_content_str, flag, if_contains_keyword = get_content(html, keyword_temp)
        if not if_contains_keyword:
            # This page no longer matches the keyword: stop paging.
            break
        weibo_mid_str = get_mid(html)
        weibo_time_str = get_time(html, flag)
        length = len(weibo_content_str)
        print(log_prefix + str(page_count) + ':爬取到' + str(length) + '条信息')
        one_page_data = get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, length)
        all_data += one_page_data
        # Pause a random 3-6 seconds (inclusive) between page requests.
        time.sleep(random.randint(3, 6))
        page_count += 1
    # page_count is the number of the page on which the loop stopped.
    print(log_prefix + str(page_count) + ':爬取完毕')
    save_to_excel(all_data, keyword_temp, date_begin_temp, date_end_temp)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|