# get_time.py

  1. from lxml import etree
  2. from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
  def get_time(html, flag):
      """Return the cleaned text of one link per post found in *html*.

      For every element matched by ``//div[@class="content"]/p[2]/a[1]`` one
      string is produced.  When the zero-based position of that element is
      contained in *flag*, the text is taken from the next unused match of
      ``//div[@class="content"]/p[3]/a[1]`` instead (the first flagged position
      uses the first such match, the second flagged position the second, ...).

      Each selected element is serialised with ``etree.tostring`` and passed
      through ``remove_html_tags``, ``html_unicode_2_chinese``, ``process_str``
      and ``reprocess_str``, in that order.

      Args:
          html: Markup handed to ``etree.HTML``.
          flag: Container tested with ``in`` against zero-based positions.

      Returns:
          A list with one cleaned value per ``p[2]/a[1]`` match, in document
          order (presumably timestamp strings, judging by the names used
          here -- confirm against the helpers).

      Raises:
          IndexError: If *flag* selects more positions than there are
              ``p[3]/a[1]`` matches.
      """
      selector = etree.HTML(html)
      weibo_time = selector.xpath('//div[@class="content"]/p[2]/a[1]')

      # The fallback query does not depend on the loop variable, so it is
      # evaluated at most once (and only if some position is flagged) instead
      # of once per flagged item.
      fallback_nodes = None
      count_s = 0  # index of the next unused fallback match

      weibo_time_str = []
      for count, node in enumerate(weibo_time):
          if count in flag:
              if fallback_nodes is None:
                  fallback_nodes = selector.xpath(
                      '//div[@class="content"]/p[3]/a[1]'
                  )
              node = fallback_nodes[count_s]
              count_s += 1

          # NOTE(review): etree.tostring returns bytes by default; this assumes
          # remove_html_tags accepts that -- confirm against process_data.
          temp = remove_html_tags(etree.tostring(node))
          temp = html_unicode_2_chinese(temp)
          temp = process_str(temp)
          temp = reprocess_str(temp)
          weibo_time_str.append(temp)

      return weibo_time_str
  def reprocess_str(unprocessed_str):
      """Return *unprocessed_str* without ASCII spaces and without '来自'.

      Spaces are removed first, so a '来自' (Chinese for "from") that was
      split by spaces, e.g. '来 自', is removed as well.

      Args:
          unprocessed_str: The string to clean.

      Returns:
          The cleaned string; an empty string stays empty.
      """
      without_spaces = unprocessed_str.replace(' ', '')
      return without_spaces.replace('来自', '')