get_content.py 936 B

12345678910111213141516171819202122232425
  1. from lxml import etree
  2. from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
  3. def get_content(html, keyword):
  4. selector = etree.HTML(html)
  5. weibo_content = selector.xpath('//div[@class="content"]/p[1]')
  6. weibo_content_str = []
  7. count = 0
  8. flag = []
  9. if_contains_keyword = False
  10. for i in weibo_content:
  11. temp = remove_html_tags(etree.tostring(i))
  12. temp = html_unicode_2_chinese(temp)
  13. temp = process_str(temp)
  14. if '展开全文c' in temp:
  15. temp = remove_html_tags(etree.tostring(selector.xpath('//div[@class="content"]/p[2]')[count]))
  16. temp = html_unicode_2_chinese(temp)
  17. temp = process_str(temp)
  18. flag.append(count)
  19. if keyword in temp:
  20. if_contains_keyword = True
  21. weibo_content_str.append(temp)
  22. count += 1
  23. return weibo_content_str, flag, if_contains_keyword