from lxml import etree

from get_weibo_content.process_data import (
    remove_html_tags,
    html_unicode_2_chinese,
    process_str,
)


def get_time(html, flag):
    """Extract one cleaned text string per ``div.content`` block in *html*.

    For every ``<div class="content">`` the first ``<a>`` inside its second
    ``<p>`` is used. Positions (0-based, in document order) that are
    contained in *flag* instead consume the next not-yet-used first ``<a>``
    of a third ``<p>``, in document order.

    Each selected element is serialised with ``etree.tostring`` and then
    passed through ``remove_html_tags``, ``html_unicode_2_chinese``,
    ``process_str`` and ``reprocess_str``, in that order.
    NOTE(review): judging by the function name the resulting text is
    presumably the post's timestamp -- confirm against the page layout.

    Args:
        html: Markup accepted by ``lxml.etree.HTML``.
        flag: Container tested with ``position in flag``; holds the indices
            whose text must come from the third ``<p>`` rather than the
            second one.

    Returns:
        A list with one processed string per matched ``p[2]/a[1]`` element.

    Raises:
        IndexError: If more positions are flagged than there are
            ``p[3]/a[1]`` elements in the document.
    """
    selector = etree.HTML(html)
    primary_links = selector.xpath('//div[@class="content"]/p[2]/a[1]')
    # This query does not depend on the loop variable, so run it once
    # instead of once per flagged position.
    alternate_links = selector.xpath('//div[@class="content"]/p[3]/a[1]')

    weibo_time_str = []
    alternates_used = 0
    for position, link in enumerate(primary_links):
        if position in flag:
            # Flagged entries take their text from the third <p>; the
            # alternates are consumed strictly in document order.
            link = alternate_links[alternates_used]
            alternates_used += 1
        text = remove_html_tags(etree.tostring(link))
        text = html_unicode_2_chinese(text)
        text = process_str(text)
        weibo_time_str.append(reprocess_str(text))
    return weibo_time_str


def reprocess_str(unprocessed_str):
    """Return *unprocessed_str* with all spaces and every '来自' removed.

    Spaces are stripped first, so a '来自' that was split by spaces in the
    input is removed as well.
    """
    without_spaces = unprocessed_str.replace(' ', '')
    return without_spaces.replace('来自', '')