4 anos atrás · 637739d427
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
 
				+# 默认忽略的文件
			
 
				+/shelf/
			
 
				+/workspace.xml
			
 
				+# 数据源本地存储已忽略文件
			
 
				+/dataSources/
			
 
				+/dataSources.local.xml
			
 
				+# 基于编辑器的 HTTP 客户端请求
			
 
				+/httpRequests/
			
--- a/.idea/WeiBoCrawler.iml
+++ b/.idea/WeiBoCrawler.iml
@@ -0,0 +1,8 @@
 
				+<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<module type="PYTHON_MODULE" version="4">
			
 
				+  <component name="NewModuleRootManager">
			
 
				+    <content url="file://$MODULE_DIR$" />
			
 
				+    <orderEntry type="inheritedJdk" />
			
 
				+    <orderEntry type="sourceFolder" forTests="false" />
			
 
				+  </component>
			
 
				+</module>
			
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,17 @@
 
				+<component name="InspectionProjectProfileManager">
			
 
				+  <profile version="1.0">
			
 
				+    <option name="myName" value="Project Default" />
			
 
				+    <inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
			
 
				+      <option name="ourVersions">
			
 
				+        <value>
			
 
				+          <list size="4">
			
 
				+            <item index="0" class="java.lang.String" itemvalue="2.7" />
			
 
				+            <item index="1" class="java.lang.String" itemvalue="3.7" />
			
 
				+            <item index="2" class="java.lang.String" itemvalue="3.8" />
			
 
				+            <item index="3" class="java.lang.String" itemvalue="3.9" />
			
 
				+          </list>
			
 
				+        </value>
			
 
				+      </option>
			
 
				+    </inspection_tool>
			
 
				+  </profile>
			
 
				+</component>
			
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
 
				+<component name="InspectionProjectProfileManager">
			
 
				+  <settings>
			
 
				+    <option name="USE_PROJECT_PROFILE" value="false" />
			
 
				+    <version value="1.0" />
			
 
				+  </settings>
			
 
				+</component>
			
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
 
				+<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<project version="4">
			
 
				+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
			
 
				+</project>
			
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
 
				+<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<project version="4">
			
 
				+  <component name="ProjectModuleManager">
			
 
				+    <modules>
			
 
				+      <module fileurl="file://$PROJECT_DIR$/.idea/WeiBoCrawler.iml" filepath="$PROJECT_DIR$/.idea/WeiBoCrawler.iml" />
			
 
				+    </modules>
			
 
				+  </component>
			
 
				+</project>
			
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
 
				+<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<project version="4">
			
 
				+  <component name="VcsDirectoryMappings">
			
 
				+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
			
 
				+  </component>
			
 
				+</project>
			
--- a/README.md
+++ b/README.md
@@ -0,0 +1 @@
 
				+# 微博爬虫
			
--- a/get_weibo_content/__init__.py
+++ b/get_weibo_content/__init__.py
--- a/get_weibo_content/__pycache__/__init__.cpython-38.pyc
+++ b/get_weibo_content/__pycache__/__init__.cpython-38.pyc
--- a/get_weibo_content/__pycache__/get_content.cpython-38.pyc
+++ b/get_weibo_content/__pycache__/get_content.cpython-38.pyc
--- a/get_weibo_content/__pycache__/get_mid.cpython-38.pyc
+++ b/get_weibo_content/__pycache__/get_mid.cpython-38.pyc
--- a/get_weibo_content/__pycache__/get_one_page.cpython-38.pyc
+++ b/get_weibo_content/__pycache__/get_one_page.cpython-38.pyc
--- a/get_weibo_content/__pycache__/get_time.cpython-38.pyc
+++ b/get_weibo_content/__pycache__/get_time.cpython-38.pyc
--- a/get_weibo_content/__pycache__/process_data.cpython-38.pyc
+++ b/get_weibo_content/__pycache__/process_data.cpython-38.pyc
--- a/get_weibo_content/get_content.py
+++ b/get_weibo_content/get_content.py
@@ -0,0 +1,27 @@
 
				+from lxml import etree
			
 
				+from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
			
 
				+
			
 
				+
			
 
				+def get_content(html, keyword):
			
 
				+    selector = etree.HTML(html)
			
 
				+    weibo_content = selector.xpath('//div[@class="content"]/p[1]')
			
 
				+    weibo_content_str = []
			
 
				+    count = 0
			
 
				+    flag = []
			
 
				+    if_contains_keyword = False
			
 
				+    for i in weibo_content:
			
 
				+        temp = remove_html_tags(etree.tostring(i))
			
 
				+        temp = html_unicode_2_chinese(temp)
			
 
				+        temp = process_str(temp)
			
 
				+        if '展开全文c' in temp:
			
 
				+            temp = remove_html_tags(etree.tostring(selector.xpath('//div[@class="content"]/p[2]')[count]))
			
 
				+            temp = html_unicode_2_chinese(temp)
			
 
				+            temp = process_str(temp)
			
 
				+            flag.append(count)
			
 
				+        if keyword in temp:
			
 
				+            if_contains_keyword = True
			
 
				+        weibo_content_str.append(temp)
			
 
				+        print(temp)
			
 
				+        count += 1
			
 
				+    print(len(weibo_content_str))
			
 
				+    return weibo_content_str, flag, if_contains_keyword
			
--- a/get_weibo_content/get_mid.py
+++ b/get_weibo_content/get_mid.py
@@ -0,0 +1,14 @@
 
				+from lxml import etree
			
 
				+
			
 
				+
			
 
				+def get_mid(html):
			
 
				+    selector = etree.HTML(html)
			
 
				+    weibo_mid = selector.xpath('//div[@class="card-wrap"]')
			
 
				+    weibo_mid_str = []
			
 
				+    for i in weibo_mid:
			
 
				+        temp = i.get("mid")
			
 
				+        if temp:
			
 
				+            weibo_mid_str.append(temp)
			
 
				+            print(temp)
			
 
				+    print(len(weibo_mid_str))
			
 
				+    return weibo_mid
			
--- a/get_weibo_content/get_one_page.py
+++ b/get_weibo_content/get_one_page.py
@@ -0,0 +1,25 @@
 
				+import requests
			
 
				+
			
 
				+
			
 
				+def get_one_page(keyword, page, date_begin, date_end, proxy_ip, cookie, user_agent):
			
 
				+    user = "lutingsong"
			
 
				+    password = "6cac3hci"
			
 
				+    proxie = {
			
 
				+        'http': 'http://' + user + ':' + password + '@' + proxy_ip,
			
 
				+        'https': 'https://' + user + ':' + password + '@' + proxy_ip
			
 
				+    }
			
 
				+    params = {
			
 
				+        'q': f'{keyword}',
			
 
				+        'typeall': 1,
			
 
				+        'suball': 1,
			
 
				+        'timescope': 'custom:' + date_begin + ':' + date_end,
			
 
				+        'Refer': 'g',
			
 
				+        'page': page,
			
 
				+    }
			
 
				+    headers = {
			
 
				+        'Cookie': cookie,
			
 
				+        'User_Agent': user_agent
			
 
				+    }
			
 
				+    url = 'https://s.weibo.com/weibo?'  # 请求api
			
 
				+    html = requests.get(url, params=params, headers=headers).content
			
 
				+    return html
			
--- a/get_weibo_content/get_time.py
+++ b/get_weibo_content/get_time.py
@@ -0,0 +1,30 @@
 
				+from lxml import etree
			
 
				+from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
			
 
				+
			
 
				+
			
 
				+def get_time(html, flag):
			
 
				+    selector = etree.HTML(html)
			
 
				+    weibo_time = selector.xpath('//div[@class="content"]/p[2]/a[1]')
			
 
				+    weibo_time_str = []
			
 
				+    count = 0
			
 
				+    count_s = 0
			
 
				+    for i in weibo_time:
			
 
				+        if count in flag:
			
 
				+            temp = remove_html_tags(etree.tostring(selector.xpath('//div[@class="content"]/p[3]/a[1]')[count_s]))
			
 
				+            count_s += 1
			
 
				+        else:
			
 
				+            temp = remove_html_tags(etree.tostring(i))
			
 
				+        temp = html_unicode_2_chinese(temp)
			
 
				+        temp = process_str(temp)
			
 
				+        temp = reprocess_str(temp)
			
 
				+        weibo_time_str.append(temp)
			
 
				+        count += 1
			
 
				+        print(temp)
			
 
				+    print(len(weibo_time_str))
			
 
				+    return weibo_time_str
			
 
				+
			
 
				+
			
 
				+def reprocess_str(unprocessed_str):
			
 
				+    temp = unprocessed_str.replace(' ', '')
			
 
				+    temp = temp.replace('来自', '')
			
 
				+    return temp
			
--- a/get_weibo_content/process_data.py
+++ b/get_weibo_content/process_data.py
@@ -0,0 +1,21 @@
 
				+import re
			
 
				+from html import unescape
			
 
				+
			
 
				+
			
 
				+# input:str
			
 
				+def remove_html_tags(html_str):
			
 
				+    temp = re.sub(r'<.*?>', '', str(html_str))
			
 
				+    processed_str = temp.replace('\n', '')
			
 
				+    return processed_str
			
 
				+
			
 
				+
			
 
				+# input:str
			
 
				+def html_unicode_2_chinese(html_unicode):
			
 
				+    chinese = unescape(html_unicode)
			
 
				+    return chinese
			
 
				+
			
 
				+
			
 
				+def process_str(unprocessed_str):
			
 
				+    temp = unprocessed_str.replace('  ', '')
			
 
				+    temp = temp.replace('\\n', '')
			
 
				+    return temp
			
--- a/main.py
+++ b/main.py
@@ -0,0 +1,39 @@
 
				+from get_weibo_content.get_one_page import get_one_page
			
 
				+from get_weibo_content.get_content import get_content
			
 
				+from get_weibo_content.get_mid import get_mid
			
 
				+from get_weibo_content.get_time import get_time
			
 
				+import datetime
			
 
				+import time
			
 
				+
			
 
				+
			
 
				+def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
			
 
				+    begin_num = 1
			
 
				+    page_count = begin_num
			
 
				+    while True:
			
 
				+        html = get_one_page(keyword_temp, page_count, date_begin_temp, date_end_temp, proxy_temp, cookie_temp,
			
 
				+                            user_agent_temp)
			
 
				+        weibo_content_str, flag, if_contains_keyword = get_content(html, keyword)
			
 
				+        if not if_contains_keyword:
			
 
				+            break
			
 
				+        get_mid(html)
			
 
				+        get_time(html, flag)
			
 
				+        time.sleep(10)
			
 
				+        page_count += 1
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    keyword = input('[-]请输入检索话题:')
			
 
				+    date_str = input('[-]请输入需要查询的当天日期(格式：2021-07-01):')
			
 
				+    date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
			
 
				+    # cookie = input('[-]请输入cookie:')
			
 
				+    cookie = 'SUB=_2A25N_5x1DeRhGeBO4lsY9y_Pyz-IHXVvAyQ9rDV8PUJbkNAfLWH8kW1NRYEkPnoC7fl3RMKtu4E9iyGtx5ldeVcn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; SINAGLOBAL=2352245042816.5166.1627033753029; ULV=1627033753032:1:1:1:2352245042816.5166.1627033753029:; UOR=,,graph.qq.com; login_sid_t=5ed1466aa2327aac6d83a6652aa1a60a; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=2352245042816.5166.1627033753029; WBtopGlobal_register_version=2021072412; webim_unReadCount=%7B%22time%22%3A1627119847912%2C%22dm_pub_total%22%3A1%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A36%2C%22msgbox%22%3A0%7D; appkey=; SSOLoginState=1627122726; wvr=6; WBStorage=2ceabba76d81138d|undefined'
			
 
				+    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
			
 
				+    proxy = '127.0.0.1:80'
			
 
				+    for i in range(0, 24):
			
 
				+        date_begin = date_str + '-' + str(i % 24)
			
 
				+        if i == 23:
			
 
				+            date_temp_str = datetime.datetime.strftime(date + datetime.timedelta(days=1), "%Y-%m-%d")
			
 
				+            date_end = date_temp_str + '-' + str((i + 1) % 24)
			
 
				+        else:
			
 
				+            date_end = date_str + '-' + str((i + 1) % 24)
			
 
				+        run_from_time_a_2_time_b(keyword, date_begin, date_end, proxy, cookie, user_agent)