# get_time.py
import time

from lxml import etree

from get_weibo_content.process_data import remove_html_tags, html_unicode_2_chinese, process_str
  4. def get_time(html, flag, date_begin):
  5. selector = etree.HTML(html)
  6. weibo_time = selector.xpath('//div[@class="content"]/p[2]/a[1]')
  7. weibo_time_str = []
  8. count = 0
  9. count_s = 0
  10. for i in weibo_time:
  11. if count in flag:
  12. temp = remove_html_tags(etree.tostring(selector.xpath('//div[@class="content"]/p[3]/a[1]')[count_s]))
  13. count_s += 1
  14. else:
  15. temp = remove_html_tags(etree.tostring(i))
  16. temp = html_unicode_2_chinese(temp)
  17. temp = process_str(temp)
  18. temp = reprocess_str(temp)
  19. if '今天' in temp:
  20. temp = temp.replace('今天', '-')
  21. temp = time.strftime("%Y-%m-%d", time.localtime()) + temp
  22. else:
  23. temp = date_begin[0:4] + '-' + temp
  24. weibo_time_str.append(temp)
  25. count += 1
  26. return weibo_time_str
  27. def reprocess_str(unprocessed_str):
  28. temp = unprocessed_str.replace(' ', '')
  29. temp = temp.replace('来自', '')
  30. temp = temp.replace('\xa0', '')
  31. temp = temp.replace('月', '-')
  32. temp = temp.replace('日', '-')
  33. return temp