Shellmiao
/
Weibo_Crawler


			
							12345678910111213141516171819202122232425
							from lxml import etree
from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str


def _extract_text(node):
    """Serialize *node* and pass it through the project's text clean-up helpers.

    The helpers (``remove_html_tags``, ``html_unicode_2_chinese``,
    ``process_str``) are imported from ``get_weibo_content.process_data`` and
    are applied in that fixed order.
    """
    text = remove_html_tags(etree.tostring(node))
    text = html_unicode_2_chinese(text)
    return process_str(text)


def get_content(html, keyword):
    """Extract the text of every post found in *html*.

    A post is the first ``<p>`` child of a ``<div class="content">`` element.
    When the cleaned text of that paragraph contains the marker ``展开全文c``
    (presumably the site's "expand full text" link on truncated posts --
    confirm against real pages), the second ``<p>`` child of the *same*
    ``div`` is used instead.

    Args:
        html: Markup of the page, in any form accepted by ``etree.HTML``.
        keyword: Text to look for in the extracted posts.

    Returns:
        A 3-tuple ``(texts, flag, if_contains_keyword)``:

        * ``texts`` -- cleaned text of each post, in document order;
        * ``flag`` -- indices into ``texts`` whose entry was replaced by the
          text of the second paragraph;
        * ``if_contains_keyword`` -- ``True`` if *keyword* occurs in at least
          one entry of ``texts``.
    """
    selector = etree.HTML(html)
    weibo_content_str = []
    flag = []
    if_contains_keyword = False
    for index, summary in enumerate(selector.xpath('//div[@class="content"]/p[1]')):
        temp = _extract_text(summary)
        if '展开全文c' in temp:
            # Look up the second paragraph relative to this post's own <div>.
            # Indexing a document-wide list of p[2] nodes by the post position
            # is wrong: only some posts have a second paragraph, so positions
            # in the two lists do not correspond.
            full_text = summary.getparent().xpath('p[2]')
            if full_text:
                temp = _extract_text(full_text[0])
                flag.append(index)
        if keyword in temp:
            if_contains_keyword = True
        weibo_content_str.append(temp)
    return weibo_content_str, flag, if_contains_keyword