12345678910111213141516171819202122232425262728 |
- from lxml import etree
- from .ProcessTool import remove_html_tags, html_unicode_2_chinese, process_str
def _extract_text(node):
    """Serialize *node* and pass it through the project's text clean-up helpers."""
    text = remove_html_tags(etree.tostring(node))
    text = html_unicode_2_chinese(text)
    return process_str(text)


def get_content(html):
    """Extract the cleaned text of every post found in one result page.

    Args:
        html: Markup of a single result page, as accepted by ``etree.HTML``.

    Returns:
        A tuple ``(texts, if_end)``. ``texts`` is a list with one cleaned
        string per ``<div class="content">`` element that has a second
        ``<p>`` child. ``if_end`` is ``True`` when the page contains the
        "no result" card (in which case ``texts`` is empty), else ``False``.
    """
    selector = etree.HTML(html)

    # The "no result" card marks the end of the listing: nothing to parse.
    if selector.xpath('//div[@class="card card-no-result s-pt20b40"]'):
        return [], True

    texts = []
    for block in selector.xpath('//div[@class="content"]'):
        summary = block.xpath("p[2]")
        if not summary:
            # No second paragraph in this card; skip it instead of raising
            # IndexError and losing the rest of the page.
            continue
        text = _extract_text(summary[0])

        # NOTE(review): the marker presumably comes from the "expand" link
        # of a truncated post, with the full text in the third paragraph --
        # confirm against live markup.
        if '展开c' in text:
            full = block.xpath("p[3]")
            if full:
                text = _extract_text(full[0])
            # Otherwise keep the text taken from the second paragraph.

        texts.append(text)

    return texts, False
|