get_time.py 942 B

123456789101112131415161718192021222324252627282930
  1. from lxml import etree
  2. from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
  3. def get_time(html, flag):
  4. selector = etree.HTML(html)
  5. weibo_time = selector.xpath('//div[@class="content"]/p[2]/a[1]')
  6. weibo_time_str = []
  7. count = 0
  8. count_s = 0
  9. for i in weibo_time:
  10. if count in flag:
  11. temp = remove_html_tags(etree.tostring(selector.xpath('//div[@class="content"]/p[3]/a[1]')[count_s]))
  12. count_s += 1
  13. else:
  14. temp = remove_html_tags(etree.tostring(i))
  15. temp = html_unicode_2_chinese(temp)
  16. temp = process_str(temp)
  17. temp = reprocess_str(temp)
  18. weibo_time_str.append(temp)
  19. count += 1
  20. print(temp)
  21. print(len(weibo_time_str))
  22. return weibo_time_str
  23. def reprocess_str(unprocessed_str):
  24. temp = unprocessed_str.replace(' ', '')
  25. temp = temp.replace('来自', '')
  26. return temp