# Main.py
  1. from Crawler.Comments import get_comments
  2. from Crawler.Page import get_one_page
  3. from Processor.CommentProcessor import process_comment
  4. from Processor.Content import get_content
  5. from Processor.CommentNum import get_comment_count
  6. from Processor.Time import get_time
  7. from Processor.LikeNum import get_like_count
  8. from Processor.Mid import get_mid
  9. from Storage.Excel import get_one_page_excel, save_to_excel
  10. from lxml import etree
  11. import datetime
  12. import time
  13. import random
  14. from Storage.Json import save_2_json
  15. from Storage.Utils import Merge
  16. def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
  17. begin_num = 1
  18. page_count = begin_num
  19. all_data_excel = []
  20. all_data_json = {}
  21. html = get_one_page(keyword_temp, page_count, date_begin_temp,
  22. date_end_temp, proxy_temp, cookie_temp, user_agent_temp)
  23. while True:
  24. # try:
  25. print('[-](' + date_begin_temp + '——' + date_end_temp +
  26. ')-page_' + str(page_count) + ':开始爬取...')
  27. html = get_one_page(keyword_temp, page_count, date_begin_temp,
  28. date_end_temp, proxy_temp, cookie_temp, user_agent_temp)
  29. wei_bo_content_str, ifEnd = get_content(html)
  30. if ifEnd:
  31. break
  32. wei_bo_mid_str = get_mid(html)
  33. wei_bo_comment_count = get_comment_count(html)
  34. wei_bo_like_count = get_like_count(html)
  35. wei_bo_time_str = get_time(html, date_begin_temp)
  36. wei_bo_comments = []
  37. for wei_bo_mid in wei_bo_mid_str:
  38. comment_page_count = 1
  39. res_mid_data = []
  40. while True:
  41. print('[-](' + date_begin_temp + '——' + date_end_temp +
  42. ')-page_' + str(page_count) + '-mid:'+wei_bo_mid+'-page_'+str(comment_page_count)+':开始爬取...')
  43. res_data = get_comments(
  44. wei_bo_mid, comment_page_count, cookie, user_agent)
  45. if(len(res_data) == 0):
  46. break
  47. else:
  48. res_mid_data += res_data
  49. comment_page_count += 1
  50. time.sleep(random.randint(3, 6))
  51. wei_bo_comment = process_comment(res_mid_data)
  52. wei_bo_comments.append(wei_bo_comment)
  53. length = len(wei_bo_content_str)
  54. result_one_page = {}
  55. for i in range(length):
  56. result_one_page[wei_bo_mid_str[i]] = {
  57. 'content': wei_bo_content_str[i],
  58. 'comment_count': wei_bo_comment_count[i],
  59. 'like_count': wei_bo_like_count[i],
  60. 'time': wei_bo_time_str[i],
  61. 'comments': wei_bo_comments[i]
  62. }
  63. Merge(result_one_page, all_data_json)
  64. print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':爬取到' + str(
  65. length) + '条信息')
  66. one_page_data = get_one_page_excel(wei_bo_content_str, wei_bo_mid_str, wei_bo_time_str, wei_bo_like_count,
  67. wei_bo_comment_count, length)
  68. all_data_excel += one_page_data
  69. time.sleep(random.randint(3, 6))
  70. page_count += 1
  71. # except Exception as e:
  72. # print(e)
  73. # continue
  74. print('[-](' + date_begin_temp + '——' + date_end_temp +
  75. ')-page_' + str(page_count) + ':爬取完毕')
  76. return all_data_excel, all_data_json
  77. if __name__ == '__main__':
  78. keyword = input('[-]请输入检索话题:')
  79. date_str = input('[-]请输入需要查询的当天日期(格式:2021-07-01):')
  80. date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
  81. # cookie = input('[-]请输入cookie:')
  82. cookie = 'SINAGLOBAL=5651725432098.134.1642487258936; UOR=,,www.google.com.hk; SSOLoginState=1654606657; SUB=_2A25PmzsRDeRhGeBO4lsY9y_Pyz-IHXVtZEVZrDV8PUJbkNAKLUOkkW1NRYEkPlI6BeV0nEOardLZmWDV2bJuQAkj; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; _s_tentry=weibo.com; Apache=8874316633747.783.1654656854407; ULV=1654656854423:4:1:1:8874316633747.783.1654656854407:1646621305826'
  83. user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
  84. proxy = '127.0.0.1:80'
  85. data_excel = []
  86. data_json = {}
  87. for i in range(0, 23):
  88. date_begin = date_str + '-' + str(i % 24)
  89. if i == 23:
  90. date_temp_str = datetime.datetime.strftime(
  91. date + datetime.timedelta(days=1), "%Y-%m-%d")
  92. date_end = date_temp_str + '-' + str((i + 1) % 24)
  93. else:
  94. date_end = date_str + '-' + str((i + 1) % 24)
  95. if not cookie:
  96. print('请在程序中填写cookie!')
  97. break
  98. a_2_b_data_excel, a_2_b_data_json = run_from_time_a_2_time_b(
  99. keyword, date_begin, date_end, proxy, cookie, user_agent)
  100. data_excel += a_2_b_data_excel
  101. Merge(a_2_b_data_json, data_json)
  102. save_to_excel(data_excel, keyword, date_str)
  103. save_2_json(data_json, keyword, date_str)