Ver código fonte

first commit

Shellmiao 4 anos atrás
commit
637739d427

+ 8 - 0
.idea/.gitignore

@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/

+ 8 - 0
.idea/WeiBoCrawler.iml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

+ 17 - 0
.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,17 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ourVersions">
+        <value>
+          <list size="4">
+            <item index="0" class="java.lang.String" itemvalue="2.7" />
+            <item index="1" class="java.lang.String" itemvalue="3.7" />
+            <item index="2" class="java.lang.String" itemvalue="3.8" />
+            <item index="3" class="java.lang.String" itemvalue="3.9" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

+ 6 - 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 4 - 0
.idea/misc.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/WeiBoCrawler.iml" filepath="$PROJECT_DIR$/.idea/WeiBoCrawler.iml" />
+    </modules>
+  </component>
+</project>

+ 6 - 0
.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

+ 1 - 0
README.md

@@ -0,0 +1 @@
+# 微博爬虫

+ 0 - 0
get_weibo_content/__init__.py


BIN
get_weibo_content/__pycache__/__init__.cpython-38.pyc


BIN
get_weibo_content/__pycache__/get_content.cpython-38.pyc


BIN
get_weibo_content/__pycache__/get_mid.cpython-38.pyc


BIN
get_weibo_content/__pycache__/get_one_page.cpython-38.pyc


BIN
get_weibo_content/__pycache__/get_time.cpython-38.pyc


BIN
get_weibo_content/__pycache__/process_data.cpython-38.pyc


+ 27 - 0
get_weibo_content/get_content.py

@@ -0,0 +1,27 @@
+from lxml import etree
+from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
+
+
+def get_content(html, keyword):
+    """Extract the text of every post on a search-result page.
+
+    A post is the first <p> directly under a <div class="content">. When that
+    paragraph contains the marker '展开全文c' (presumably a truncated preview
+    -- confirm against the live markup), the second <p> of the same div is
+    used instead and the post's position is recorded in ``flag``.
+
+    Args:
+        html: Page markup accepted by ``lxml.etree.HTML``.
+        keyword: Text looked for in each extracted post.
+
+    Returns:
+        A tuple ``(weibo_content_str, flag, if_contains_keyword)``: the list
+        of extracted post texts, the positions whose text came from the
+        second paragraph, and whether any post text contains ``keyword``.
+    """
+    selector = etree.HTML(html)
+    weibo_content = selector.xpath('//div[@class="content"]/p[1]')
+    weibo_content_str = []
+    flag = []
+    if_contains_keyword = False
+    for count, paragraph in enumerate(weibo_content):
+        temp = _paragraph_text(paragraph)
+        if '展开全文c' in temp:
+            # Resolve the second paragraph relative to this post. Indexing a
+            # page-wide p[2] list by position misaligns (or raises IndexError)
+            # as soon as one post on the page has no second paragraph.
+            full_paragraphs = paragraph.xpath('../p[2]')
+            if full_paragraphs:
+                temp = _paragraph_text(full_paragraphs[0])
+                flag.append(count)
+        if keyword in temp:
+            if_contains_keyword = True
+        weibo_content_str.append(temp)
+        print(temp)
+    print(len(weibo_content_str))
+    return weibo_content_str, flag, if_contains_keyword
+
+
+def _paragraph_text(element):
+    """Serialize an lxml element and run it through the shared text clean-up helpers."""
+    text = remove_html_tags(etree.tostring(element))
+    text = html_unicode_2_chinese(text)
+    return process_str(text)

+ 14 - 0
get_weibo_content/get_mid.py

@@ -0,0 +1,14 @@
+from lxml import etree
+
+
+def get_mid(html):
+    """Collect the ``mid`` attribute of every card on a search-result page.
+
+    Args:
+        html: Page markup accepted by ``lxml.etree.HTML``.
+
+    Returns:
+        A list with the non-empty ``mid`` attribute values of all
+        <div class="card-wrap"> elements, in document order.
+    """
+    selector = etree.HTML(html)
+    cards = selector.xpath('//div[@class="card-wrap"]')
+    weibo_mid_str = []
+    for card in cards:
+        mid = card.get("mid")
+        if mid:
+            weibo_mid_str.append(mid)
+            print(mid)
+    print(len(weibo_mid_str))
+    # Return the collected ids; the raw element list was returned before,
+    # which made the list built above unreachable for callers.
+    return weibo_mid_str

+ 25 - 0
get_weibo_content/get_one_page.py

@@ -0,0 +1,25 @@
+import requests
+
+
+def get_one_page(keyword, page, date_begin, date_end, proxy_ip, cookie, user_agent):
+    """Fetch one page of s.weibo.com search results and return the raw body.
+
+    Args:
+        keyword: Search term, sent as the ``q`` query parameter.
+        page: Result page number, sent as the ``page`` query parameter.
+        date_begin: Start of the time window, joined into ``timescope``.
+        date_end: End of the time window, joined into ``timescope``.
+        proxy_ip: Accepted for interface compatibility but not applied. A
+            proxies mapping with hard-coded credentials used to be built here
+            without ever being passed to ``requests.get``; that dead code was
+            removed, so the request is still sent directly.
+        cookie: Value of the ``Cookie`` request header.
+        user_agent: Value of the ``User-Agent`` request header.
+
+    Returns:
+        The response body as bytes (``Response.content``).
+    """
+    params = {
+        'q': str(keyword),
+        'typeall': 1,
+        'suball': 1,
+        'timescope': f'custom:{date_begin}:{date_end}',
+        'Refer': 'g',
+        'page': page,
+    }
+    headers = {
+        'Cookie': cookie,
+        # The header name uses a hyphen; 'User_Agent' is a different header,
+        # so the caller's user agent never replaced the library default.
+        'User-Agent': user_agent,
+    }
+    url = 'https://s.weibo.com/weibo?'  # search endpoint
+    return requests.get(url, params=params, headers=headers).content

+ 30 - 0
get_weibo_content/get_time.py

@@ -0,0 +1,30 @@
+from lxml import etree
+from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
+
+
+def get_time(html, flag):
+    """Extract the cleaned text of one link per post on a search-result page.
+
+    For each <div class="content"> that has a <p> child, the first <a> of the
+    second paragraph is used, or of the third paragraph when the post's
+    position is listed in ``flag``. Presumably this link carries the post's
+    timestamp -- confirm against the live markup.
+
+    Args:
+        html: Page markup accepted by ``lxml.etree.HTML``.
+        flag: Positions of posts whose text was taken from the second
+            paragraph (main.py passes the list returned by ``get_content``).
+
+    Returns:
+        A list with one string per post, in document order. A post without
+        the expected link contributes an empty string so positions stay
+        aligned with the post list.
+    """
+    selector = etree.HTML(html)
+    # Walk the posts themselves so positions match the
+    # //div[@class="content"]/p[1] order that ``flag`` refers to. Pairing
+    # separate page-wide link lists by counter shifts every later entry as
+    # soon as one post lacks the link.
+    posts = selector.xpath('//div[@class="content"][p]')
+    weibo_time_str = []
+    for count, post in enumerate(posts):
+        link_path = './p[3]/a[1]' if count in flag else './p[2]/a[1]'
+        links = post.xpath(link_path)
+        if links:
+            temp = remove_html_tags(etree.tostring(links[0]))
+            temp = html_unicode_2_chinese(temp)
+            temp = process_str(temp)
+            temp = reprocess_str(temp)
+        else:
+            temp = ''
+        weibo_time_str.append(temp)
+        print(temp)
+    print(len(weibo_time_str))
+    return weibo_time_str
+
+
+def reprocess_str(unprocessed_str):
+    """Return unprocessed_str with every space removed, then every '来自' removed."""
+    without_spaces = unprocessed_str.replace(' ', '')
+    return without_spaces.replace('来自', '')

+ 21 - 0
get_weibo_content/process_data.py

@@ -0,0 +1,21 @@
+import re
+from html import unescape
+
+
+# input: str, or bytes such as the result of lxml's etree.tostring
+def remove_html_tags(html_str):
+    """Return html_str with markup tags and newline characters removed.
+
+    Args:
+        html_str: Markup as str or bytes. Any other object is converted
+            with ``str()`` first.
+
+    Returns:
+        The text with everything between '<' and the next '>' dropped and
+        all '\\n' characters removed.
+    """
+    if isinstance(html_str, (bytes, bytearray)):
+        # str() on bytes yields the "b'...'" repr: a stray b'' wrapper and
+        # newlines turned into a literal backslash-n. Decode instead.
+        html_str = html_str.decode('utf-8', errors='replace')
+    # DOTALL lets a tag that spans several lines be removed as well.
+    temp = re.sub(r'<.*?>', '', str(html_str), flags=re.DOTALL)
+    processed_str = temp.replace('\n', '')
+    return processed_str
+
+
+# input: str
+def html_unicode_2_chinese(html_unicode):
+    """Return html_unicode with HTML character references decoded by html.unescape."""
+    return unescape(html_unicode)
+
+
+def process_str(unprocessed_str):
+    """Drop every double space, then every literal backslash-n sequence.
+
+    Real newline characters are left untouched; only the two-character
+    sequence backslash + 'n' is removed.
+    """
+    return unprocessed_str.replace('  ', '').replace('\\n', '')

+ 39 - 0
main.py

@@ -0,0 +1,39 @@
+from get_weibo_content.get_one_page import get_one_page
+from get_weibo_content.get_content import get_content
+from get_weibo_content.get_mid import get_mid
+from get_weibo_content.get_time import get_time
+import datetime
+import time
+
+
+def run_from_time_a_2_time_b(keyword_temp, date_begin_temp, date_end_temp, proxy_temp, cookie_temp, user_agent_temp):
+    """Crawl successive result pages of one time window.
+
+    Pages are fetched starting at page 1. For each page the post texts, mids
+    and link texts are extracted (the helpers print them). Crawling stops at
+    the first page on which no post text contains the keyword; otherwise the
+    loop sleeps 10 seconds and moves on to the next page.
+
+    Args:
+        keyword_temp: Search keyword.
+        date_begin_temp: Start of the time window, passed to ``get_one_page``.
+        date_end_temp: End of the time window, passed to ``get_one_page``.
+        proxy_temp: Proxy address, passed to ``get_one_page``.
+        cookie_temp: Cookie header value, passed to ``get_one_page``.
+        user_agent_temp: User agent value, passed to ``get_one_page``.
+    """
+    page_count = 1
+    while True:
+        html = get_one_page(keyword_temp, page_count, date_begin_temp, date_end_temp, proxy_temp, cookie_temp,
+                            user_agent_temp)
+        # Use this function's own argument: the script-level global `keyword`
+        # only exists when main.py runs as a script, so reading it here raised
+        # NameError whenever the function was imported and called.
+        _content, flag, if_contains_keyword = get_content(html, keyword_temp)
+        if not if_contains_keyword:
+            break
+        get_mid(html)
+        get_time(html, flag)
+        time.sleep(10)
+        page_count += 1
+
+
+if __name__ == '__main__':
+    # Script entry: prompt for a topic and a day, then crawl that day in 24
+    # consecutive one-hour windows.
+    keyword = input('[-]请输入检索话题:')
+    date_str = input('[-]请输入需要查询的当天日期(格式:2021-07-01):')
+    date = datetime.datetime.strptime(date_str, "%Y-%m-%d")
+    # NOTE(review): a session cookie is hard-coded below; the disabled prompt
+    # is kept for reference.
+    # cookie = input('[-]请输入cookie:')
+    cookie = 'SUB=_2A25N_5x1DeRhGeBO4lsY9y_Pyz-IHXVvAyQ9rDV8PUJbkNAfLWH8kW1NRYEkPnoC7fl3RMKtu4E9iyGtx5ldeVcn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWLS0lcQryz4UlBfKyjai.L5NHD95Qceh.41KMpe050Ws4Dqcjz-cyLdspDqgYt; SINAGLOBAL=2352245042816.5166.1627033753029; ULV=1627033753032:1:1:1:2352245042816.5166.1627033753029:; UOR=,,graph.qq.com; login_sid_t=5ed1466aa2327aac6d83a6652aa1a60a; cross_origin_proto=SSL; _s_tentry=passport.weibo.com; Apache=2352245042816.5166.1627033753029; WBtopGlobal_register_version=2021072412; webim_unReadCount=%7B%22time%22%3A1627119847912%2C%22dm_pub_total%22%3A1%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A36%2C%22msgbox%22%3A0%7D; appkey=; SSOLoginState=1627122726; wvr=6; WBStorage=2ceabba76d81138d|undefined'
+    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
+    proxy = '127.0.0.1:80'
+    for hour in range(24):
+        next_hour = (hour + 1) % 24
+        if hour == 23:
+            # The last window ends at hour 0 of the following day.
+            end_day = (date + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
+        else:
+            end_day = date_str
+        date_begin = f'{date_str}-{hour}'
+        date_end = f'{end_day}-{next_hour}'
+        run_from_time_a_2_time_b(keyword, date_begin, date_end, proxy, cookie, user_agent)