import time

from lxml import etree

from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str


def get_time(html, flag, date_begin: str) -> list:
    """Extract one timestamp string per matched post link in ``html``.

    The document is parsed with ``etree.HTML`` and every node matching
    ``//div[@class="content"]/p[2]/a[1]`` yields one entry in the result.
    For positions contained in ``flag`` the timestamp is instead taken from
    the nodes matching ``//div[@class="content"]/p[3]/a[1]``, which are
    consumed in document order (first flagged position -> first such node,
    and so on).

    Each chosen node is serialized, passed through ``remove_html_tags``,
    ``html_unicode_2_chinese`` and ``process_str`` (project helpers), then
    through ``reprocess_str``.  If the cleaned text contains '今天', that
    marker is replaced with '-' and the current local date (``%Y-%m-%d``)
    is prepended; otherwise ``date_begin[0:4]`` followed by '-' is
    prepended.

    Args:
        html: Markup accepted by ``lxml.etree.HTML``.
        flag: Container of zero-based positions (tested with ``in``) whose
            timestamp must be read from the ``p[3]`` link.
        date_begin: String whose first four characters are used as the
            prefix for non-'今天' entries.
            NOTE(review): presumably a date starting with a four-digit
            year -- confirm against the caller.

    Returns:
        A list of timestamp strings, one per ``p[2]/a[1]`` match, in
        document order.

    Raises:
        IndexError: If ``flag`` selects more positions than there are
            ``p[3]/a[1]`` nodes.
    """
    selector = etree.HTML(html)
    primary_links = selector.xpath('//div[@class="content"]/p[2]/a[1]')
    # The tree is never modified below, so this query is loop-invariant:
    # evaluate it once instead of once per flagged post.
    fallback_links = selector.xpath('//div[@class="content"]/p[3]/a[1]')

    weibo_time_str = []
    fallback_taken = 0  # how many p[3] links have been consumed so far
    for position, link in enumerate(primary_links):
        if position in flag:
            node = fallback_links[fallback_taken]
            fallback_taken += 1
        else:
            node = link

        text = remove_html_tags(etree.tostring(node))
        text = html_unicode_2_chinese(text)
        text = process_str(text)
        text = reprocess_str(text)

        if '今天' in text:
            # "Today" entries carry no date of their own: substitute the
            # current local date, keeping '-' as the separator.
            text = time.strftime("%Y-%m-%d", time.localtime()) + text.replace('今天', '-')
        else:
            text = date_begin[0:4] + '-' + text
        weibo_time_str.append(text)
    return weibo_time_str


def reprocess_str(unprocessed_str: str) -> str:
    """Normalize a scraped timestamp fragment.

    Removes ASCII spaces, the literal '来自', and non-breaking spaces
    ('\\xa0'), then turns the '月' and '日' markers into '-' separators.
    The replacements are applied one after another in the order listed.
    """
    replacements = (
        (' ', ''),
        ('来自', ''),
        ('\xa0', ''),
        ('月', '-'),
        ('日', '-'),
    )
    cleaned = unprocessed_str
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned