Content.py 979 B

12345678910111213141516171819202122232425262728
  1. from lxml import etree
  2. from .ProcessTool import remove_html_tags, html_unicode_2_chinese, process_str
  def _extract_paragraph_text(node, position):
      """Return the cleaned text of the ``position``-th ``<p>`` child of *node*.

      The paragraph is serialized with ``etree.tostring`` and then passed, in
      order, through ``remove_html_tags``, ``html_unicode_2_chinese`` and
      ``process_str``.

      Returns ``None`` when *node* has no such paragraph, so callers can decide
      how to handle the gap instead of hitting an ``IndexError``.
      """
      matches = node.xpath("p[%d]" % position)
      if not matches:
          return None
      text = remove_html_tags(etree.tostring(matches[0]))
      text = html_unicode_2_chinese(text)
      return process_str(text)


  def get_content(html):
      """Extract the cleaned text of every content block found in *html*.

      Args:
          html: HTML markup of one result page, as accepted by ``etree.HTML``.

      Returns:
          A ``(texts, ifEnd)`` tuple. ``texts`` is the list of processed
          strings, one per ``<div class="content">`` that has a usable
          paragraph. ``ifEnd`` is ``True`` when the page contains the
          "no result" card (in which case ``texts`` is empty) and ``False``
          otherwise.
      """
      selector = etree.HTML(html)
      wei_bo_content_str = []

      # A "no result" card means there is nothing (more) to parse.
      # NOTE(review): the class attribute is matched as an exact string, so any
      # change in class order or additions on the site will defeat this check.
      if selector.xpath('//div[@class="card card-no-result s-pt20b40"]'):
          return wei_bo_content_str, True

      for block in selector.xpath('//div[@class="content"]'):
          text = _extract_paragraph_text(block, 2)
          if text is None:
              # Content block without the expected second paragraph: skip it
              # rather than aborting the whole page.
              continue
          # Presumably this marker is what remains of the "expand full text"
          # link after cleaning, meaning the second paragraph is truncated and
          # the third one holds the full text -- TODO confirm against live markup.
          if '展开c' in text:
              full_text = _extract_paragraph_text(block, 3)
              if full_text is not None:
                  # Keep the truncated text when the third paragraph is absent.
                  text = full_text
          wei_bo_content_str.append(text)

      return wei_bo_content_str, False