```python
from lxml import etree

from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str


def get_content(html, keyword):
    """Extract post texts from a Weibo result page, expanding truncated posts."""
    selector = etree.HTML(html)
    # The first <p> of each content block holds the (possibly truncated) post text.
    weibo_content = selector.xpath('//div[@class="content"]/p[1]')
    weibo_content_str = []
    flag = []  # indices of posts whose full text was taken from p[2]
    if_contains_keyword = False
    for count, node in enumerate(weibo_content):
        temp = remove_html_tags(etree.tostring(node))
        temp = html_unicode_2_chinese(temp)
        temp = process_str(temp)
        # '展开全文c' is Weibo's "show full text" marker; the complete text sits in the
        # second <p>. This assumes the p[1] and p[2] node lists stay aligned by index.
        if '展开全文c' in temp:
            temp = remove_html_tags(etree.tostring(selector.xpath('//div[@class="content"]/p[2]')[count]))
            temp = html_unicode_2_chinese(temp)
            temp = process_str(temp)
            flag.append(count)
        if keyword in temp:
            if_contains_keyword = True
        weibo_content_str.append(temp)
    return weibo_content_str, flag, if_contains_keyword
```
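The three helpers come from the project's own `get_weibo_content.process_data` module, which is not shown here. The sketch below is a hypothetical reconstruction based only on how they are used above; the real implementations may differ.

```python
# Hypothetical sketch of the helpers in get_weibo_content.process_data.
# The real module is not shown, so names and behaviour are assumptions.
import html
import re


def remove_html_tags(fragment):
    """Strip HTML tags from a serialized element (bytes or str)."""
    if isinstance(fragment, bytes):
        fragment = fragment.decode('utf-8', errors='ignore')
    return re.sub(r'<[^>]+>', '', fragment)


def html_unicode_2_chinese(text):
    """Convert HTML entities (e.g. &#x5FAE;) back to Chinese characters."""
    return html.unescape(text)


def process_str(text):
    """Collapse whitespace and trim the cleaned text."""
    return re.sub(r'\s+', ' ', text).strip()
```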
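A minimal usage sketch, assuming the result page has already been fetched. The search URL, request headers, and keyword below are placeholders for illustration, not part of the original project.

```python
import requests

# Placeholder request: any Weibo search-result page whose posts sit in
# <div class="content"> blocks will do.
url = 'https://s.weibo.com/weibo?q=keyword'  # hypothetical search URL
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text

posts, expanded, found = get_content(html, '关键词')  # '关键词' is a placeholder keyword
print(found)     # True if any post contains the keyword
print(expanded)  # indices of posts whose full text came from p[2]
for text in posts:
    print(text)
```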