# Shellmiao / weibo.com-crawler

from lxml import etree
from .ProcessTool import remove_html_tags, html_unicode_2_chinese, process_str


def _paragraph_text(node, position):
    """Return the cleaned text of the ``position``-th ``<p>`` child of ``node``.

    The paragraph is serialized, stripped of markup with ``remove_html_tags``,
    then passed through ``html_unicode_2_chinese`` and ``process_str``.

    Returns:
        The cleaned text, or ``None`` when ``node`` has no such paragraph.
    """
    matches = node.xpath("p[%d]" % position)
    if not matches:
        return None
    text = remove_html_tags(etree.tostring(matches[0]))
    text = html_unicode_2_chinese(text)
    return process_str(text)


def get_content(html):
    """Extract the text of every post found on one result page.

    Args:
        html: Markup of the page, in whatever form ``etree.HTML`` accepts.

    Returns:
        A ``(contents, if_end)`` tuple. ``contents`` is the list of cleaned
        post texts in document order. ``if_end`` is ``True`` when the page
        contains the "no result" card, in which case ``contents`` is empty;
        otherwise it is ``False``.
    """
    selector = etree.HTML(html)

    # The "no result" card signals that crawling has run past the last page.
    if selector.xpath('//div[@class="card card-no-result s-pt20b40"]'):
        return [], True

    contents = []
    for card in selector.xpath('//div[@class="content"]'):
        text = _paragraph_text(card, 2)
        if text is None:
            # A content block without a second paragraph has nothing to
            # extract; skip it rather than aborting the whole page.
            continue
        # When this marker is present the third paragraph is used instead
        # (presumably the marker is the residue of the "expand" link on a
        # truncated post and p[3] holds the full text -- confirm against
        # live markup).
        if '展开c' in text:
            full_text = _paragraph_text(card, 3)
            if full_text is not None:
                # Keep the truncated text if the replacement is missing.
                text = full_text
        contents.append(text)
    return contents, False