|
@@ -1,4 +1,6 @@
|
|
|
+from Crawler.Comments import get_comments
|
|
|
from Crawler.Page import get_one_page
|
|
|
+from Processor.CommentProcessor import process_comment
|
|
|
from Processor.Content import get_content
|
|
|
from Processor.CommentNum import get_comment_count
|
|
|
from Processor.Time import get_time
|
|
@@ -34,6 +36,24 @@ def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy
|
|
|
wei_bo_comment_count = get_comment_count(html)
|
|
|
wei_bo_like_count = get_like_count(html)
|
|
|
wei_bo_time_str = get_time(html, date_begin_temp)
|
|
|
+ wei_bo_comments = []
|
|
|
+ for wei_bo_mid in wei_bo_mid_str:
|
|
|
+ comment_page_count = 1
|
|
|
+ res_mid_data = []
|
|
|
+ while True:
|
|
|
+ print('[-](' + date_begin_temp + '——' + date_end_temp +
|
|
|
+ ')-page_' + str(page_count) + '-mid:'+wei_bo_mid+'-page_'+str(comment_page_count)+':开始爬取...')
|
|
|
+ res_data = get_comments(
|
|
|
+ wei_bo_mid, comment_page_count, cookie, user_agent)
|
|
|
+ if(len(res_data) == 0):
|
|
|
+ break
|
|
|
+ else:
|
|
|
+ res_mid_data += res_data
|
|
|
+ comment_page_count += 1
|
|
|
+ time.sleep(random.randint(3, 6))
|
|
|
+ wei_bo_comment = process_comment(res_mid_data)
|
|
|
+ wei_bo_comments.append(wei_bo_comment)
|
|
|
+
|
|
|
length = len(wei_bo_content_str)
|
|
|
result_one_page = {}
|
|
|
for i in range(length):
|
|
@@ -41,7 +61,8 @@ def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy
|
|
|
'content': wei_bo_content_str[i],
|
|
|
'comment_count': wei_bo_comment_count[i],
|
|
|
'like_count': wei_bo_like_count[i],
|
|
|
- 'time': wei_bo_time_str[i]
|
|
|
+ 'time': wei_bo_time_str[i],
|
|
|
+ 'comments': wei_bo_comments[i]
|
|
|
}
|
|
|
Merge(result_one_page, all_data_json)
|
|
|
print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':爬取到' + str(
|
|
@@ -69,7 +90,7 @@ if __name__ == '__main__':
|
|
|
proxy = '127.0.0.1:80'
|
|
|
data_excel = []
|
|
|
data_json = {}
|
|
|
- for i in range(9, 10):
|
|
|
+ for i in range(0, 23):
|
|
|
date_begin = date_str + '-' + str(i % 24)
|
|
|
if i == 23:
|
|
|
date_temp_str = datetime.datetime.strftime(
|