from lxml import etree

from .ProcessTool import remove_html_tags, html_unicode_2_chinese, process_str


def _extract_paragraph_text(content_node, index):
    """Return the cleaned text of the ``index``-th ``<p>`` child, or ``None``.

    The paragraph is serialized with ``etree.tostring`` and then passed
    through ``remove_html_tags``, ``html_unicode_2_chinese`` and
    ``process_str`` in that order.

    Args:
        content_node: An lxml element whose direct ``<p>`` children are
            searched.
        index: 1-based XPath position of the ``<p>`` child to read.

    Returns:
        The processed text, or ``None`` when ``content_node`` has no
        ``<p>`` child at that position.
    """
    paragraphs = content_node.xpath("p[%d]" % index)
    if not paragraphs:
        # Indexing [0] on an empty XPath result would raise IndexError and
        # abort parsing of the whole page; let the caller decide instead.
        return None
    text = remove_html_tags(etree.tostring(paragraphs[0]))
    text = html_unicode_2_chinese(text)
    return process_str(text)


def get_content(html):
    """Extract the post texts from one page of HTML.

    Args:
        html: The page markup, passed as-is to ``etree.HTML``.

    Returns:
        A tuple ``(wei_bo_content_str, ifEnd)``:

        * ``wei_bo_content_str`` -- list with one processed text per
          ``div[@class="content"]`` node that has a second ``<p>`` child.
        * ``ifEnd`` -- ``True`` when the page contains the "no result"
          card (the list is then empty), ``False`` otherwise.
    """
    # Initialization.
    selector = etree.HTML(html)
    wei_bo_content_str = []

    # Check whether the results have already ended.
    if selector.xpath('//div[@class="card card-no-result s-pt20b40"]'):
        return wei_bo_content_str, True

    # Parse the content nodes.
    for content_node in selector.xpath('//div[@class="content"]'):
        text = _extract_paragraph_text(content_node, 2)
        if text is None:
            # No second paragraph in this node: skip it rather than crash.
            continue
        # NOTE(review): '展开c' presumably is what remains of the "expand"
        # link on a truncated post, with the third <p> holding the full
        # text -- confirm against the live markup.
        if '展开c' in text:
            full_text = _extract_paragraph_text(content_node, 3)
            if full_text is not None:
                text = full_text
            # Otherwise keep the truncated text instead of losing the post.
        wei_bo_content_str.append(text)

    return wei_bo_content_str, False