
feat: implement crawling of hot Weibo posts for a specified keyword on a specified date: content, time, mid, like count, comment count

lutingsong 2 years ago
parent
commit
8285208e13

BIN
.DS_Store


+ 16 - 0
.vscode/launch.json

@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: 当前文件",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": true
+        }
+    ]
+}

+ 20 - 0
Crawler/Page.py

@@ -0,0 +1,20 @@
+import requests
+
+
+def get_one_page(keyword, page, date_begin, date_end, proxy_ip, cookie, user_agent):
+    params = {
+        'q': keyword,
+        'xsort': 'hot',
+        'typeall': 1,
+        'suball': 1,
+        'timescope': 'custom:' + date_begin + ':' + date_end,
+        'Refer': 'g',
+        'page': page,
+    }
+    headers = {
+        'Cookie': cookie,
+        'User-Agent': user_agent  # must be 'User-Agent'; 'User_Agent' is not a valid header name
+    }
+    url = 'https://s.weibo.com/weibo'  # search endpoint; requests appends the query string
+    html = requests.get(url, params=params, headers=headers).content  # proxy_ip is accepted but not yet used
+    return html
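
For reference, a minimal sketch of calling this fetcher directly; the keyword, dates, cookie, and user agent below are placeholders, not values from the commit:

from Crawler.Page import get_one_page

# Placeholder arguments; a valid logged-in cookie is required in practice.
html = get_one_page(
    keyword='北京疫情',
    page=1,
    date_begin='2022-06-06-9',    # timescope format: YYYY-MM-DD-H
    date_end='2022-06-06-10',
    proxy_ip=None,                # currently unused by get_one_page
    cookie='<your weibo cookie>',
    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
)
print(len(html))                  # size of the raw search-results HTML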

+ 0 - 0
Crawler/__init__.py


BIN
Crawler/__pycache__/Page.cpython-37.pyc


BIN
Crawler/__pycache__/__init__.cpython-37.pyc


+ 85 - 0
Main.py

@@ -0,0 +1,85 @@
+from Crawler.Page import get_one_page
+from Processor.Content import get_content
+from Processor.CommentNum import get_comment_count
+from Processor.Time import get_time
+from Processor.LikeNum import get_like_count
+from Processor.Mid import get_mid
+from Storage.Excel import get_one_page_excel, save_to_excel
+import datetime
+import time
+import random
+from Storage.Json import save_2_json
+
+from Storage.Utils import Merge
+
+
+def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
+    begin_num = 1
+    page_count = begin_num
+    all_data_excel = []
+    all_data_json = {}
+    while True:
+        # try:
+        print('[-](' + date_begin_temp + '——' + date_end_temp +
+              ')-page_' + str(page_count) + ': start crawling...')
+        html = get_one_page(keyword_temp, page_count, date_begin_temp,
+                            date_end_temp, proxy_temp, cookie_temp, user_agent_temp)
+        wei_bo_content_str, ifEnd = get_content(html)
+        if ifEnd:
+            break
+        wei_bo_mid_str = get_mid(html)
+        wei_bo_comment_count = get_comment_count(html)
+        wei_bo_like_count = get_like_count(html)
+        wei_bo_time_str = get_time(html, date_begin_temp)
+        length = len(wei_bo_content_str)
+        result_one_page = {}
+        for i in range(length):
+            result_one_page[wei_bo_mid_str[i]] = {
+                'content': wei_bo_content_str[i],
+                'comment_count': wei_bo_comment_count[i],
+                'like_count': wei_bo_like_count[i],
+                'time': wei_bo_time_str[i]
+            }
+        Merge(result_one_page, all_data_json)
+        print('[-](' + date_begin_temp + '——' + date_end_temp + ')-page_' + str(page_count) + ': fetched ' + str(
+            length) + ' posts')
+        one_page_data = get_one_page_excel(wei_bo_content_str, wei_bo_mid_str, wei_bo_time_str, wei_bo_like_count,
+                                           wei_bo_comment_count, length)
+        all_data_excel += one_page_data
+        time.sleep(random.randint(3, 6))
+        page_count += 1
+        # except Exception as e:
+        #     print(e)
+        #     continue
+    print('[-](' + date_begin_temp + '——' + date_end_temp +
+          ')-page_' + str(page_count) + ': crawl finished')
+    return all_data_excel, all_data_json
+
+
+if __name__ == '__main__':
+    keyword = input('[-]Enter the search keyword: ')
+    date_str = input('[-]Enter the date to query (format: 2021-07-01): ')
+    date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
+    # cookie = input('[-]Enter your cookie: ')
+    cookie = 'SINAGLOBAL=5651725432098.134.1642487258936; UOR=,,www.google.com.hk; SSOLoginState=1654606657; SUB=_2A25PmzsRDeRhGeBO4lsY9y_Pyz-IHXVtZEVZrDV8PUJbkNAKLUOkkW1NRYEkPlI6BeV0nEOardLZmWDV2bJuQAkj; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; _s_tentry=weibo.com; Apache=8874316633747.783.1654656854407; ULV=1654656854423:4:1:1:8874316633747.783.1654656854407:1646621305826'  # hardcoded session cookie; replace with a valid one
+    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
+    proxy = '127.0.0.1:80'
+    data_excel = []
+    data_json = {}
+    for i in range(9, 10):  # only the 9:00-10:00 window; use range(0, 24) for the full day
+        date_begin = date_str + '-' + str(i % 24)
+        if i == 23:
+            date_temp_str = datetime.datetime.strftime(
+                date + datetime.timedelta(days=1), "%Y-%m-%d")
+            date_end = date_temp_str + '-' + str((i + 1) % 24)
+        else:
+            date_end = date_str + '-' + str((i + 1) % 24)
+        if not cookie:
+            print('Please fill in the cookie in the script!')
+            break
+        a_2_b_data_excel, a_2_b_data_json = run_from_time_a_2_time_b(
+            keyword, date_begin, date_end, proxy, cookie, user_agent)
+        data_excel += a_2_b_data_excel
+        Merge(a_2_b_data_json, data_json)
+    save_to_excel(data_excel, keyword, date_str)
+    save_2_json(data_json, keyword, date_str)
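
Note that `range(9, 10)` above only covers the 09:00–10:00 window. A sketch of generating all 24 hourly windows with the same wrap-around rule (a hypothetical helper, not part of the commit):

import datetime

def hourly_windows(date_str):
    # Yield (date_begin, date_end) pairs covering one day; hour 23
    # ends at hour 0 of the following day, as in the loop above.
    date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    for i in range(24):
        begin = date_str + '-' + str(i)
        if i == 23:
            next_day = (date + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
            end = next_day + '-0'
        else:
            end = date_str + '-' + str(i + 1)
        yield begin, end

for begin, end in hourly_windows('2022-06-06'):
    print(begin, end)   # 2022-06-06-0 2022-06-06-1 ... 2022-06-06-23 2022-06-07-0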

+ 25 - 0
Processor/CommentNum.py

@@ -0,0 +1,25 @@
+from lxml import etree
+from .ProcessTool import remove_html_tags, html_unicode_2_chinese, process_str
+
+
+def get_comment_count(html):
+    selector = etree.HTML(html)
+    weibo_comment_count_temp = selector.xpath(
+        '//div[@class="card-act"]/ul/li[2]/a')
+    weibo_comment_count = []
+    for i in weibo_comment_count_temp:
+        temp = remove_html_tags(etree.tostring(i))
+        temp = html_unicode_2_chinese(temp)
+        temp = process_str(temp)
+        temp = reprocess_str(temp)
+        if temp:
+            weibo_comment_count.append(temp)
+        else:
+            weibo_comment_count.append('0')
+    return weibo_comment_count
+
+
+def reprocess_str(unprocessed_str):
+    temp = unprocessed_str.replace(' ', '')
+    temp = temp.replace('评论', '')  # strip the '评论' (comments) label, leaving the bare count
+    return temp

+ 26 - 0
Processor/Content.py

@@ -0,0 +1,26 @@
+from lxml import etree
+from .ProcessTool import remove_html_tags, html_unicode_2_chinese, process_str
+
+
+def get_content(html):
+    # initialization
+    selector = etree.HTML(html)
+    wei_bo_content_str = []
+    # check whether the result pages have run out
+    notFound = selector.xpath('//div[@class="card card-no-result s-pt20b40"]')
+    ifEnd = False
+    if len(notFound) != 0:
+        ifEnd = True
+        return wei_bo_content_str, ifEnd
+    # parse the post content
+    wei_bo_content = selector.xpath('//div[@class="content"]')
+    for i in wei_bo_content:
+        temp = remove_html_tags(etree.tostring(i.xpath("p[2]")[0]))
+        temp = html_unicode_2_chinese(temp)
+        temp = process_str(temp)
+        if '展开c' in temp:  # truncated post: the full text sits in p[3]
+            temp = remove_html_tags(etree.tostring(i.xpath("p[3]")[0]))
+            temp = html_unicode_2_chinese(temp)
+            temp = process_str(temp)
+        wei_bo_content_str.append(temp)
+    return wei_bo_content_str, ifEnd
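
A quick check of the truncation fallback against a fabricated snippet (the HTML below only mimics the real card markup):

from Processor.Content import get_content

# Fabricated card: p[2] is the clipped preview ending in 展开c,
# p[3] carries the full text.
snippet = b'<div class="content"><p>meta</p>' \
          b'<p>clipped preview...展开c</p><p>full post text</p></div>'
texts, ended = get_content(snippet)
print(texts, ended)   # -> ['full post text'] False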

+ 18 - 0
Processor/LikeNum.py

@@ -0,0 +1,18 @@
+from lxml import etree
+from .ProcessTool import html_unicode_2_chinese, process_str, remove_html_tags
+
+
+def get_like_count(html):
+    selector = etree.HTML(html)
+    wei_bo_like_count_temp = selector.xpath(
+        '//div[@class="card-act"]/ul/li[3]/a/button/span[2]')
+    wei_bo_like_count = []
+    for i in wei_bo_like_count_temp:
+        temp = remove_html_tags(etree.tostring(i))
+        temp = html_unicode_2_chinese(temp)
+        temp = process_str(temp)
+        if temp == '赞':  # a bare '赞' (Like) label means zero likes
+            wei_bo_like_count.append('0')
+        else:
+            wei_bo_like_count.append(temp)
+    return wei_bo_like_count

+ 12 - 0
Processor/Mid.py

@@ -0,0 +1,12 @@
+from lxml import etree
+
+
+def get_mid(html):
+    selector = etree.HTML(html)
+    mid = selector.xpath('//div[@class="card-wrap"]')
+    mid_str = []
+    for i in mid:
+        temp = i.get("mid")  # each card-wrap div carries the post id in its mid attribute
+        if temp:
+            mid_str.append(temp)
+    return mid_str

+ 19 - 0
Processor/ProcessTool.py

@@ -0,0 +1,19 @@
+import re
+from html import unescape
+
+
+def remove_html_tags(html_str):
+    temp = re.sub(r'<.*?>', '', html_str.decode("utf-8"))
+    processed_str = temp.replace('\n', '')
+    return processed_str
+
+
+def html_unicode_2_chinese(html_unicode):
+    chinese = unescape(html_unicode)
+    return chinese
+
+
+def process_str(unprocessed_str):
+    temp = unprocessed_str.replace('  ', '')  # drop double-space runs left by tag removal
+    temp = temp.replace('\\n', '')            # drop literal backslash-n sequences
+    return temp
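
These three helpers are chained in every Processor module; a short illustration of the pipeline on a fabricated anchor tag:

from Processor.ProcessTool import remove_html_tags, html_unicode_2_chinese, process_str

raw = b'<a href="#">&#35780;&#35770; 56</a>\n'   # fabricated bytes, as etree.tostring returns
step = remove_html_tags(raw)          # '&#35780;&#35770; 56'  (tags and newlines stripped)
step = html_unicode_2_chinese(step)   # '评论 56'  (entities unescaped)
step = process_str(step)              # '评论 56'  (double spaces and literal \n removed)
print(step)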

+ 30 - 0
Processor/Time.py

@@ -0,0 +1,30 @@
+from lxml import etree
+import time
+from .ProcessTool import remove_html_tags, html_unicode_2_chinese, process_str
+
+
+def get_time(html, date_begin):
+    selector = etree.HTML(html)
+    wei_bo_time = selector.xpath('//div[@class="content"]/p[1]/a[1]')
+    wei_bo_time_str = []
+    for i in wei_bo_time:
+        temp = remove_html_tags(etree.tostring(i))
+        temp = html_unicode_2_chinese(temp)
+        temp = process_str(temp)
+        temp = reprocess_str(temp)
+        if '今天' in temp:  # '今天' (today) -> stamp with the local date
+            temp = temp.replace('今天', ' ')
+            temp = time.strftime("%Y-%m-%d", time.localtime()) + temp
+        else:
+            temp = date_begin[0:4] + '-' + temp
+        wei_bo_time_str.append(temp)
+    return wei_bo_time_str
+
+
+def reprocess_str(unprocessed_str):
+    temp = unprocessed_str.replace(' ', '')
+    temp = temp.replace('来自', '')  # drop the '来自' (posted from) source suffix
+    temp = temp.replace('\xa0', '')
+    temp = temp.replace('月', '-')   # month marker -> dash
+    temp = temp.replace('日', '-')   # day marker -> dash
+    return temp
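
So a stamp like '6月6日 12:00' normalizes as follows (values fabricated for illustration):

from Processor.Time import reprocess_str

stamp = reprocess_str('6月6日 12:00')   # -> '6-6-12:00'
date_begin = '2022-06-06-9'
print(date_begin[0:4] + '-' + stamp)    # -> '2022-6-6-12:00', as get_time assembles it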

+ 0 - 0
Processor/__init__.py


BIN
Processor/__pycache__/CommentNum.cpython-37.pyc


BIN
Processor/__pycache__/Content.cpython-37.pyc


BIN
Processor/__pycache__/LikeNum.cpython-37.pyc


BIN
Processor/__pycache__/Mid.cpython-37.pyc


BIN
Processor/__pycache__/ProcessTool.cpython-37.pyc


BIN
Processor/__pycache__/Time.cpython-37.pyc


BIN
Processor/__pycache__/__init__.cpython-37.pyc


+ 16 - 0
Storage/Excel.py

@@ -0,0 +1,16 @@
+import pandas as pd
+
+
+def get_one_page_excel(weibo_content_str, weibo_mid_str, weibo_time_str, weibo_like_count, weibo_comment_count, length):
+    one_page_data = []
+    for i in range(0, length):
+        one_piece_data = (
+            weibo_mid_str[i], weibo_time_str[i], weibo_content_str[i], weibo_like_count[i], weibo_comment_count[i])
+        column_name = ('文章ID', '发文时间', '文章内容', '点赞数', '评论数')
+        one_page_data.append(dict(zip(column_name, one_piece_data)))
+    return one_page_data
+
+
+def save_to_excel(all_data, keyword, date):
+    df = pd.DataFrame(all_data)
+    df.to_excel('微博爬取内容-' + keyword + '-' + date + '.xlsx', index=False)
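
A sketch of the two functions chained together with placeholder row data (writing the .xlsx additionally requires openpyxl):

from Storage.Excel import get_one_page_excel, save_to_excel

rows = get_one_page_excel(
    ['示例内容'],            # content
    ['4779000000000000'],    # mid (placeholder)
    ['2022-6-6-12:00'],      # time
    ['10'],                  # like count
    ['3'],                   # comment count
    1)                       # length
save_to_excel(rows, '北京疫情', '2022-06-06')
# -> writes 微博爬取内容-北京疫情-2022-06-06.xlsx with one row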

+ 8 - 0
Storage/Json.py

@@ -0,0 +1,8 @@
+import json
+
+
+def save_2_json(data, keyword, date):
+    json_data = json.dumps(data, ensure_ascii=False)
+    # write with an explicit utf-8 encoding and close the file deterministically
+    with open('微博爬取内容-' + keyword + '-' + date + '.json', 'w', encoding='utf-8') as file:
+        file.write(json_data)

+ 3 - 0
Storage/Utils.py

@@ -0,0 +1,3 @@
+def Merge(dict1, dict2):
+    dict2.update(dict1)  # fold dict1 into dict2 in place; dict.update() itself returns None
+    return dict2
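
Merge mutates its second argument, which is how Main.py accumulates per-page results into one dict:

from Storage.Utils import Merge

all_data = {}
Merge({'mid_1': {'like_count': '5'}}, all_data)
Merge({'mid_2': {'like_count': '9'}}, all_data)
print(all_data)   # both pages' entries collected in all_data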

BIN
Storage/__pycache__/Excel.cpython-37.pyc


BIN
Storage/__pycache__/Json.cpython-37.pyc


BIN
Storage/__pycache__/Utils.cpython-37.pyc


File diff suppressed because it is too large
+ 14 - 0
微博爬取内容-北京疫情-2022-06-06.json


BIN
微博爬取内容-北京疫情-2022-06-06.xlsx


Some files were not shown because too many files changed in this diff