# process_data.py
import re
from html import unescape
  3. # input:str
  4. def remove_html_tags(html_str):
  5. temp = re.sub(r'<.*?>', '', html_str.decode("utf-8"))
  6. processed_str = temp.replace('\n', '')
  7. return processed_str
  8. # input:str
  9. def html_unicode_2_chinese(html_unicode):
  10. chinese = unescape(html_unicode)
  11. return chinese
  12. def process_str(unprocessed_str):
  13. temp = unprocessed_str.replace(' ', '')
  14. temp = temp.replace('\\n', '')
  15. return temp