
feat: implement crawling and saving of comments

lutingsong 2 years ago
parent commit e15c3427ab

+ 20 - 0
Crawler/Comments.py

@@ -0,0 +1,20 @@
+import requests
+import json
+
+
+def get_comments(id, page, cookie, user_agent):  # id: status mid, page: 1-based page index
+    params = {
+        'id': id,
+        'page': page,
+        'moduleID': 'feed',
+        'count': 10 if page == 1 else 20  # the API serves 10 items on page 1, 20 on later pages
+    }
+    headers = {
+        'Cookie': cookie,
+        'User-Agent': user_agent
+    }
+    url = 'https://weibo.com/ajax/statuses/repostTimeline'  # repost timeline API endpoint
+    res = requests.get(url, params=params, headers=headers).content
+    res_json = json.loads(res)
+    res_data = res_json['data']
+    return res_data
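
For context, a minimal sketch of exercising this helper on its own. The cookie, user agent, and mid values below are placeholders (a real logged-in Weibo cookie is required in practice), and note that the endpoint queried here is the repost timeline (repostTimeline), so each returned item is a repost record rather than a plain comment:

# Hypothetical standalone usage; all values are placeholders.
from Crawler.Comments import get_comments

cookie = 'SUB=...; SUBP=...'      # a valid logged-in Weibo cookie
user_agent = 'Mozilla/5.0 ...'    # any realistic browser user agent

# Page 1 returns up to 10 items, later pages up to 20 (the count param above).
data = get_comments('4775948840931330', 1, cookie, user_agent)
for item in data:
    print(item['user']['screen_name'], item['text_raw'])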

BIN
Crawler/__pycache__/Comments.cpython-37.pyc


+ 23 - 2
Main.py

@@ -1,4 +1,6 @@
+from Crawler.Comments import get_comments
 from Crawler.Page import get_one_page
+from Processor.CommentProcessor import process_comment
 from Processor.Content import get_content
 from Processor.CommentNum import get_comment_count
 from Processor.Time import get_time
@@ -34,6 +36,24 @@ def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy
         wei_bo_comment_count = get_comment_count(html)
         wei_bo_like_count = get_like_count(html)
         wei_bo_time_str = get_time(html, date_begin_temp)
+        wei_bo_comments = []
+        for wei_bo_mid in wei_bo_mid_str:
+            comment_page_count = 1
+            res_mid_data = []
+            while True:
+                print('[-](' + date_begin_temp + '——' + date_end_temp +
+                      ')-page_' + str(page_count) + '-mid:' + wei_bo_mid + '-page_' + str(comment_page_count) + ':start crawling...')
+                res_data = get_comments(
+                    wei_bo_mid, comment_page_count, cookie, user_agent)
+                if len(res_data) == 0:
+                    break
+                else:
+                    res_mid_data += res_data
+                comment_page_count += 1
+                time.sleep(random.randint(3, 6))
+            wei_bo_comment = process_comment(res_mid_data)
+            wei_bo_comments.append(wei_bo_comment)
+
         length = len(wei_bo_content_str)
         result_one_page = {}
         for i in range(length):
@@ -41,7 +61,8 @@ def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy
                 'content': wei_bo_content_str[i],
                 'comment_count': wei_bo_comment_count[i],
                 'like_count': wei_bo_like_count[i],
-                'time': wei_bo_time_str[i]
+                'time': wei_bo_time_str[i],
+                'comments': wei_bo_comments[i]
             }
         Merge(result_one_page, all_data_json)
        print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ':crawled ' + str(
@@ -69,7 +90,7 @@ if __name__ == '__main__':
     proxy = '127.0.0.1:80'
     data_excel = []
     data_json = {}
-    for i in range(9, 10):
+    for i in range(0, 23):
         date_begin = date_str + '-' + str(i % 24)
         if i == 23:
             date_temp_str = datetime.datetime.strftime(

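As a quick sanity check on the widened hour loop above, a sketch (assuming date_str = '2022-06-06', matching the committed output files) of the window labels it generates; note that range(0, 23) runs i from 0 to 22, so the i == 23 branch below it is never taken:

# Hypothetical trace of the hourly window labels built by the loop.
date_str = '2022-06-06'
for i in range(0, 23):
    date_begin = date_str + '-' + str(i % 24)
    # yields '2022-06-06-0', '2022-06-06-1', ..., '2022-06-06-22'
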
+ 58 - 0
Processor/CommentProcessor.py

@@ -0,0 +1,58 @@
+def process_comment(res_data):
+    result = {}
+
+    def add_comment(root, father_user_id, user_id, comment):  # recursively attach a reply under its parent user
+        if father_user_id in root.keys():
+            root[father_user_id]['comments'][user_id] = comment
+            return root
+        for key in root.keys():
+            comments = root[key]['comments']
+            new_comments = add_comment(
+                comments, father_user_id, user_id, comment)
+            if new_comments:
+                root[key]['comments'] = new_comments
+                return root
+        return False
+
+    for data in res_data:
+        comment_id = data['id']
+        host_user_id = data['user']['screen_name']
+        if '//' in data['text_raw']:
+            texts = data['text_raw'].split('//')
+            father_id = ''
+            for text in texts[::-1]:
+                text = text.split(':')
+                if len(text) == 1:
+                    content = text[0] if text[0] != '' else '快转微博'  # empty segment marks a quick repost
+                    if father_id == '':
+                        result[host_user_id] = {
+                            'comment_id': comment_id, 'content': content, 'comments': {}
+                        }
+                    else:
+                        result = add_comment(result, father_id, host_user_id, {
+                                             'content': content,
+                                             'comment_id': comment_id,
+                                             'comments': {}
+                                             }) or result  # fall back if father_id was not found
+                else:
+                    user_id = text[0][1:]
+                    content = text[1] if text[1] != '' else '快转微博'
+                    if father_id == '':
+                        if user_id not in result.keys():
+                            result[user_id] = {
+                                'content': content,
+                                'comments': {}
+                            }
+                        father_id = user_id
+                    else:
+                        result = add_comment(result, father_id, user_id, {
+                                             'content': content,
+                                             'comments': {}
+                                             }) or result  # fall back if father_id was not found
+                        father_id = user_id
+        else:
+            content = data['text_raw']
+            result[host_user_id] = {
+                'comment_id': comment_id, 'content': content, 'comments': {}
+            }
+    return result
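
To illustrate the tree that process_comment builds, a hypothetical input record (field values invented; the keys 'id', 'user' -> 'screen_name', and 'text_raw' match what get_comments returns). A text_raw that chains earlier reposts with '//@user:text' segments is unwound right to left into nested 'comments' dicts:

# Hypothetical input: 'alice' reposted 'bob', who reposted the original status.
from Processor.CommentProcessor import process_comment

sample = [{
    'id': '4780000000000001',
    'user': {'screen_name': 'alice'},
    'text_raw': 'nice//@bob:first repost',
}]

tree = process_comment(sample)
# {'bob': {'content': 'first repost',
#          'comments': {'alice': {'content': 'nice',
#                                 'comment_id': '4780000000000001',
#                                 'comments': {}}}}}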

BIN
Processor/__pycache__/CommentProcessor.cpython-37.pyc


File diff suppressed because it is too large
+ 1 - 0
微博爬取内容-北京疫情-2022-06-06.json


BIN
微博爬取内容-北京疫情-2022-06-06.xlsx


Some files were not shown because too many files changed in this diff