import re
from html import unescape

# Non-greedy match of anything between '<' and the next '>'. '.' does not
# match a newline here, so a tag that spans a line break is not matched.
_TAG_PATTERN = re.compile(r'<.*?>')


def remove_html_tags(html_str) -> str:
    """Strip HTML tags and newline characters from *html_str*.

    Args:
        html_str: The markup to clean, either as ``str`` or as UTF-8
            encoded bytes (any non-``str`` object exposing ``.decode``).

    Returns:
        The text with every ``<...>`` span and every ``'\\n'`` removed.
    """
    # Only decode when we were not handed text already: ``str`` has no
    # ``.decode`` on Python 3, so decoding unconditionally fails for str.
    if not isinstance(html_str, str):
        html_str = html_str.decode("utf-8")
    without_tags = _TAG_PATTERN.sub('', html_str)
    return without_tags.replace('\n', '')


def html_unicode_2_chinese(html_unicode: str) -> str:
    """Convert HTML character references in *html_unicode* to characters.

    Args:
        html_unicode: Text containing named or numeric HTML character
            references (for example ``&#20013;`` or ``&amp;``).

    Returns:
        The text with those references replaced by the corresponding
        Unicode characters, as produced by ``html.unescape``.
    """
    return unescape(html_unicode)


def process_str(unprocessed_str: str) -> str:
    """Remove spaces and literal ``\\n`` escape sequences from a string.

    Args:
        unprocessed_str: The text to clean.

    Returns:
        The text with every space character removed and every
        two-character sequence backslash + ``n`` removed. Real newline
        characters are left untouched.
    """
    no_spaces = unprocessed_str.replace(' ', '')
    # '\\n' is the two-character sequence backslash + 'n', not a newline.
    return no_spaces.replace('\\n', '')