| 123456789101112131415161718192021222324252627282930 |
- from lxml import etree
- from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
def get_time(html, flag):
    """Collect one cleaned text string per ``p[2]/a[1]`` anchor found in *html*.

    Every ``<a>`` matched by ``//div[@class="content"]/p[2]/a[1]`` yields one
    entry in the result.  For positions listed in *flag*, the anchor text is
    taken from the ``//div[@class="content"]/p[3]/a[1]`` matches instead,
    consumed in document order (first flagged position -> first ``p[3]``
    match, and so on).

    Each selected anchor is serialized, then passed through
    ``remove_html_tags``, ``html_unicode_2_chinese``, ``process_str`` and
    ``reprocess_str`` in that order.  Each cleaned string, and finally the
    number of strings collected, is printed to stdout.

    NOTE(review): the nesting of this function was reconstructed from a
    listing that had lost its indentation -- confirm that the cleaning steps
    and the per-item ``print`` belong to the loop body for both branches.

    Args:
        html: HTML markup to parse with ``lxml.etree.HTML``.
        flag: container of 0-based positions (indices into the ``p[2]/a[1]``
            match list) whose text should come from ``p[3]/a[1]`` instead.
            Presumably marks entries with an extra leading paragraph --
            verify against the caller.

    Returns:
        list: the cleaned strings, in document order.

    Raises:
        IndexError: if *flag* selects more positions than there are
            ``p[3]/a[1]`` matches.
    """
    selector = etree.HTML(html)
    weibo_time = selector.xpath('//div[@class="content"]/p[2]/a[1]')
    # Loop-invariant query: evaluate it once instead of re-running the XPath
    # on every flagged iteration as the previous version did.
    fallback_links = selector.xpath('//div[@class="content"]/p[3]/a[1]')

    weibo_time_str = []
    count_s = 0  # index of the next unused p[3] anchor
    for count, link in enumerate(weibo_time):
        if count in flag:
            node = fallback_links[count_s]
            count_s += 1
        else:
            node = link
        temp = remove_html_tags(etree.tostring(node))
        temp = html_unicode_2_chinese(temp)
        temp = process_str(temp)
        temp = reprocess_str(temp)
        weibo_time_str.append(temp)
        print(temp)
    print(len(weibo_time_str))
    return weibo_time_str
def reprocess_str(unprocessed_str):
    """Return *unprocessed_str* with every space and every '来自' removed.

    Spaces are stripped first, then the '来自' marker, so a marker that only
    becomes contiguous after the spaces are gone is removed as well.
    """
    cleaned = unprocessed_str
    # Order matters: spaces first, marker second.
    for fragment in (' ', '来自'):
        cleaned = cleaned.replace(fragment, '')
    return cleaned
|