|
@@ -0,0 +1,88 @@
|
|
|
+from Crawler.Page import get_one_page
|
|
|
+from Processor.Content import get_content
|
|
|
+from Processor.CommentNum import get_comment_count
|
|
|
+from Processor.Time import get_time
|
|
|
+from Processor.LikeNum import get_like_count
|
|
|
+from Processor.Mid import get_mid
|
|
|
+from Storage.Excel import get_one_page_excel, save_to_excel
|
|
|
+from lxml import etree
|
|
|
+import datetime
|
|
|
+import time
|
|
|
+import random
|
|
|
+from Storage.Json import save_2_json
|
|
|
+
|
|
|
+from Storage.Utils import Merge
|
|
|
+
|
|
|
+
|
|
|
def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
    """Crawl all result pages for one keyword inside one time window.

    Pages are requested one after another, starting at page 1, until
    ``get_content`` reports that the end of the results was reached.

    Args:
        keyword_temp: Search keyword, forwarded to ``get_one_page``.
        date_begin_temp: Start of the time window; forwarded to
            ``get_one_page`` and ``get_time`` and shown in progress output.
        date_end_temp: End of the time window; forwarded to ``get_one_page``
            and shown in progress output.
        proxy_temp: Proxy setting, forwarded to ``get_one_page``.
        cookie_temp: Cookie string, forwarded to ``get_one_page``.
        user_agent_temp: User-Agent string, forwarded to ``get_one_page``.

    Returns:
        A tuple ``(all_data_excel, all_data_json)``: the concatenation of the
        per-page values returned by ``get_one_page_excel``, and the dict that
        every page's ``{mid: record}`` mapping was passed to ``Merge`` with.
    """
    page_count = 1
    all_data_excel = []
    all_data_json = {}
    # Common prefix of every progress message for this time window.
    window_tag = f'[-]({date_begin_temp}——{date_end_temp})-page_'

    while True:
        print(f'{window_tag}{page_count}:开始爬取...')
        # Each page is fetched exactly once, here. (The previous version also
        # fetched page 1 before the loop and discarded the result.)
        html = get_one_page(keyword_temp, page_count, date_begin_temp,
                            date_end_temp, proxy_temp, cookie_temp, user_agent_temp)
        wei_bo_content_str, if_end = get_content(html)
        if if_end:
            break

        wei_bo_mid_str = get_mid(html)
        wei_bo_comment_count = get_comment_count(html)
        wei_bo_like_count = get_like_count(html)
        wei_bo_time_str = get_time(html, date_begin_temp)
        length = len(wei_bo_content_str)

        # Indexing (rather than zip) keeps a length mismatch between the
        # extracted lists loud: it raises IndexError instead of truncating.
        result_one_page = {
            wei_bo_mid_str[i]: {
                'content': wei_bo_content_str[i],
                'comment_count': wei_bo_comment_count[i],
                'like_count': wei_bo_like_count[i],
                'time': wei_bo_time_str[i],
            }
            for i in range(length)
        }
        # NOTE(review): presumably Merge folds its first argument into the
        # second in place -- confirm against Storage.Utils.Merge.
        Merge(result_one_page, all_data_json)
        print(f'{window_tag}{page_count}:爬取到{length}条信息')

        all_data_excel += get_one_page_excel(
            wei_bo_content_str, wei_bo_mid_str, wei_bo_time_str,
            wei_bo_like_count, wei_bo_comment_count, length)

        # Randomised 3-6 second pause between consecutive page requests.
        time.sleep(random.randint(3, 6))
        page_count += 1

    # page_count is the page on which get_content signalled the end.
    print(f'{window_tag}{page_count}:爬取完毕')
    return all_data_excel, all_data_json
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Script entry point: ask for a keyword and a day, crawl the configured
    # hourly windows of that day, then write the results to Excel and JSON.
    keyword = input('[-]请输入检索话题:')
    date_str = input('[-]请输入需要查询的当天日期(格式:2021-07-01):')
    date = datetime.datetime.strptime(date_str, "%Y-%m-%d")

    # NOTE(review): a session cookie is hard-coded below; consider loading it
    # from the environment or a prompt instead of keeping it in source.
    cookie = 'SINAGLOBAL=5651725432098.134.1642487258936; UOR=,,www.google.com.hk; SSOLoginState=1654606657; SUB=_2A25PmzsRDeRhGeBO4lsY9y_Pyz-IHXVtZEVZrDV8PUJbkNAKLUOkkW1NRYEkPlI6BeV0nEOardLZmWDV2bJuQAkj; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; _s_tentry=weibo.com; Apache=8874316633747.783.1654656854407; ULV=1654656854423:4:1:1:8874316633747.783.1654656854407:1646621305826'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
    proxy = '127.0.0.1:80'

    data_excel = []
    data_json = {}

    if not cookie:
        # Without a cookie nothing is crawled; the (empty) results are still
        # handed to the writers below.
        print('请在程序中填写cookie!')
    else:
        # NOTE(review): only the 09:00-10:00 window is crawled; the midnight
        # rollover handling suggests range(24) may have been intended.
        for hour in range(9, 10):
            next_hour = hour + 1
            date_begin = f'{date_str}-{hour % 24}'
            if hour == 23:
                # The last window of the day ends at hour 0 of the next day.
                end_day = (date + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
            else:
                end_day = date_str
            date_end = f'{end_day}-{next_hour % 24}'

            a_2_b_data_excel, a_2_b_data_json = run_from_time_a_2_time_b(
                keyword, date_begin, date_end, proxy, cookie, user_agent)
            data_excel += a_2_b_data_excel
            Merge(a_2_b_data_json, data_json)

    save_to_excel(data_excel, keyword, date_str)
    save_2_json(data_json, keyword, date_str)
|