from lxml import etree

from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str

# Substring searched for in the cleaned first paragraph of a content block.
# NOTE(review): presumably the residue of the "expand full text" link once the
# tags are stripped -- confirm against real pages and process_str().
_EXPAND_MARKER = '展开全文c'


def _clean_text(node):
    """Serialize *node* and pass it through the project's text clean-up helpers."""
    text = remove_html_tags(etree.tostring(node))
    text = html_unicode_2_chinese(text)
    return process_str(text)


def get_content(html, keyword):
    """Extract the cleaned text of every ``div.content`` block in *html*.

    For each ``<div class="content">`` the first ``<p>`` child is cleaned and
    collected. When that text contains the expand marker, the second ``<p>``
    child of the *same* div is used instead and the block's index is recorded.

    Args:
        html: HTML document handed to ``lxml.etree.HTML``.
        keyword: Substring looked for in each collected text.

    Returns:
        A 3-tuple ``(texts, flag, if_contains_keyword)``:

        * ``texts`` -- cleaned text per content block, in document order.
        * ``flag`` -- indexes into ``texts`` whose entry came from the second
          ``<p>`` rather than the first.
        * ``if_contains_keyword`` -- ``True`` if *keyword* occurs in at least
          one collected text.
    """
    selector = etree.HTML(html)
    weibo_content_str = []
    flag = []
    if_contains_keyword = False

    first_paragraphs = selector.xpath('//div[@class="content"]/p[1]')
    for index, paragraph in enumerate(first_paragraphs):
        text = _clean_text(paragraph)
        if _EXPAND_MARKER in text:
            # Look up the second <p> relative to this paragraph's own parent.
            # Indexing a document-wide list of second paragraphs by position
            # misaligns as soon as one block has no second <p>.
            second_paragraph = paragraph.xpath('../p[2]')
            if second_paragraph:
                text = _clean_text(second_paragraph[0])
                flag.append(index)
            # Otherwise keep the first paragraph's text and leave the index
            # unflagged instead of raising IndexError.
        if keyword in text:
            if_contains_keyword = True
        weibo_content_str.append(text)

    return weibo_content_str, flag, if_contains_keyword