from Crawler.Comments import get_comments
from Crawler.Page import get_one_page
from Processor.CommentProcessor import process_comment
from Processor.Content import get_content
from Processor.CommentNum import get_comment_count
from Processor.Time import get_time
from Processor.LikeNum import get_like_count
from Processor.Mid import get_mid
from Storage.Excel import get_one_page_excel, save_to_excel
from Storage.Json import save_2_json
from Storage.Utils import Merge
from lxml import etree
import datetime
import time
import random


def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
    """Crawl every search-result page for keyword_temp between date_begin_temp and date_end_temp.

    Returns a tuple (all_data_excel, all_data_json) with the rows for Excel export
    and the mid-keyed dict for JSON export.
    """
    page_count = 1
    all_data_excel = []
    all_data_json = {}
    while True:
        # try:
        print(f'[-]({date_begin_temp}--{date_end_temp})-page_{page_count}: start crawling...')
        html = get_one_page(keyword_temp, page_count, date_begin_temp, date_end_temp,
                            proxy_temp, cookie_temp, user_agent_temp)
        wei_bo_content_str, is_end = get_content(html)
        if is_end:
            # No more result pages in this time window.
            break
        wei_bo_mid_str = get_mid(html)
        wei_bo_comment_count = get_comment_count(html)
        wei_bo_like_count = get_like_count(html)
        wei_bo_time_str = get_time(html, date_begin_temp)

        # Fetch all comment pages for each post (mid) on this result page.
        wei_bo_comments = []
        for wei_bo_mid in wei_bo_mid_str:
            comment_page_count = 1
            res_mid_data = []
            while True:
                print(f'[-]({date_begin_temp}--{date_end_temp})-page_{page_count}'
                      f'-mid:{wei_bo_mid}-page_{comment_page_count}: start crawling...')
                res_data = get_comments(wei_bo_mid, comment_page_count, cookie_temp, user_agent_temp)
                if len(res_data) == 0:
                    break
                res_mid_data += res_data
                comment_page_count += 1
                time.sleep(random.randint(3, 6))
            wei_bo_comments.append(process_comment(res_mid_data))

        # Assemble one record per post, keyed by mid.
        length = len(wei_bo_content_str)
        result_one_page = {}
        for i in range(length):
            result_one_page[wei_bo_mid_str[i]] = {
                'content': wei_bo_content_str[i],
                'comment_count': wei_bo_comment_count[i],
                'like_count': wei_bo_like_count[i],
                'time': wei_bo_time_str[i],
                'comments': wei_bo_comments[i]
            }
        Merge(result_one_page, all_data_json)
        print(f'[-]({date_begin_temp}--{date_end_temp})-page_{page_count}: crawled {length} posts')

        one_page_data = get_one_page_excel(wei_bo_content_str, wei_bo_mid_str, wei_bo_time_str,
                                           wei_bo_like_count, wei_bo_comment_count, length)
        all_data_excel += one_page_data
        time.sleep(random.randint(3, 6))
        page_count += 1
        # except Exception as e:
        #     print(e)
        #     continue
    print(f'[-]({date_begin_temp}--{date_end_temp})-page_{page_count}: crawling finished')
    return all_data_excel, all_data_json


if __name__ == '__main__':
    keyword = input('[-]Enter the topic to search for: ')
    date_str = input('[-]Enter the date to crawl (format: 2021-07-01): ')
    date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    # cookie = input('[-]Enter cookie: ')
    cookie = 'SINAGLOBAL=5651725432098.134.1642487258936; UOR=,,www.google.com.hk; SSOLoginState=1654606657; SUB=_2A25PmzsRDeRhGeBO4lsY9y_Pyz-IHXVtZEVZrDV8PUJbkNAKLUOkkW1NRYEkPlI6BeV0nEOardLZmWDV2bJuQAkj; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; _s_tentry=weibo.com; Apache=8874316633747.783.1654656854407; ULV=1654656854423:4:1:1:8874316633747.783.1654656854407:1646621305826'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
    proxy = '127.0.0.1:80'

    data_excel = []
    data_json = {}
    # Split the day into 24 one-hour windows; the last window ends at hour 0 of the next day.
    for i in range(0, 24):
        date_begin = date_str + '-' + str(i % 24)
        if i == 23:
            date_temp_str = datetime.datetime.strftime(
                date + datetime.timedelta(days=1), "%Y-%m-%d")
            date_end = date_temp_str + '-' + str((i + 1) % 24)
        else:
            date_end = date_str + '-' + str((i + 1) % 24)
        if not cookie:
            print('Please set the cookie in the program!')
            break
        a_2_b_data_excel, a_2_b_data_json = run_from_time_a_2_time_b(
            keyword, date_begin, date_end, proxy, cookie, user_agent)
        data_excel += a_2_b_data_excel
        Merge(a_2_b_data_json, data_json)
    save_to_excel(data_excel, keyword, date_str)
    save_2_json(data_json, keyword, date_str)