main.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. from Crawler.get_weibo_content.get_content import get_content
  2. from Crawler.get_weibo_content.get_one_page import get_one_page
  3. from Crawler.get_weibo_content.get_mid import get_mid
  4. from Crawler.get_weibo_content.get_time import get_time
  5. from Crawler.get_weibo_content.get_comment import get_comment_count
  6. from Crawler.get_weibo_content.get_like import get_like_count
  7. from Crawler.save_data.save_data_to_excel import get_one_page_excel, save_to_excel
  8. import datetime
  9. import time
  10. import random
  11. def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
  12. begin_num = 1
  13. page_count = begin_num
  14. all_data = []
  15. while True:
  16. try:
  17. print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':开始爬取...')
  18. html = get_one_page(keyword_temp, page_count, date_begin_temp, date_end_temp, proxy_temp, cookie_temp,
  19. user_agent_temp)
  20. weibo_content_str, flag, if_contains_keyword = get_content(html, keyword)
  21. if not if_contains_keyword:
  22. break
  23. weibo_mid_str = get_mid(html)
  24. weibo_comment_count = get_comment_count(html)
  25. weibo_like_count = get_like_count(html)
  26. weibo_time_str = get_time(html, flag, date_begin_temp)
  27. length = len(weibo_content_str)
  28. print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':爬取到' + str(
  29. length) + '条信息')
  30. one_page_data = get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, weibo_like_count,
  31. weibo_comment_count, length)
  32. all_data += one_page_data
  33. time.sleep(random.randint(3, 6))
  34. page_count += 1
  35. except Exception as e:
  36. print(e)
  37. continue
  38. print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':爬取完毕')
  39. save_to_excel(all_data, keyword_temp, date_begin_temp, date_end_temp)
  40. if __name__ == '__main__':
  41. keyword = input('[-]请输入检索话题:')
  42. date_str = input('[-]请输入需要查询的当天日期(格式:2021-07-01):')
  43. date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
  44. # cookie = input('[-]请输入cookie:')
  45. # 测试用cookie
  46. cookie = 'SUB=_2A25N_5x1DeRhGeBO4lsY9y_Pyz-IHXVvAyQ9rDV8PUJbkNAfLWH8kW1NRYEkPnoC7fl3RMKtu4E9iyGtx5ldeVcn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; SINAGLOBAL=2352245042816.5166.1627033753029; ULV=1627033753032:1:1:1:2352245042816.5166.1627033753029:; UOR=,,graph.qq.com; login_sid_t=5ed1466aa2327aac6d83a6652aa1a60a; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=2352245042816.5166.1627033753029; WBtopGlobal_register_version=2021072412; webim_unReadCount=%7B%22time%22%3A1627119847912%2C%22dm_pub_total%22%3A1%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A36%2C%22msgbox%22%3A0%7D; appkey=; SSOLoginState=1627122726; wvr=6; WBStorage=2ceabba76d81138d|undefined'
  47. # 写上cookie
  48. # cookie = ''
  49. user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
  50. proxy = '127.0.0.1:80'
  51. for i in range(0, 24):
  52. date_begin = date_str + '-' + str(i % 24)
  53. if i == 23:
  54. date_temp_str = datetime.datetime.strftime(date + datetime.timedelta(days=1), "%Y-%m-%d")
  55. date_end = date_temp_str + '-' + str((i + 1) % 24)
  56. else:
  57. date_end = date_str + '-' + str((i + 1) % 24)
  58. if not cookie:
  59. print('请在程序中填写cookie!')
  60. break
  61. run_from_time_a_2_time_b(keyword, date_begin, date_end, proxy, cookie, user_agent)