language_utils.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. """
  2. This code is adapted from LEAF.
  3. """
  4. import json
  5. import re
  6. import numpy as np
  7. # ------------------------
  8. # utils for shakespeare dataset
# Characters known to the shakespeare helpers below; a character's position
# in this string is the index that word_to_indices reports for it.
ALL_LETTERS = "\n !\"&'(),-.0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz}"
# Number of characters in ALL_LETTERS; letter_to_vec uses it as the vector size.
NUM_LETTERS = len(ALL_LETTERS)
  11. def _one_hot(index, size):
  12. """returns one-hot vector with given size and value 1 at given index"""
  13. vec = [0 for _ in range(size)]
  14. vec[int(index)] = 1
  15. return vec
  16. def letter_to_vec(letter):
  17. """returns one-hot representation of given letter"""
  18. index = ALL_LETTERS.find(letter)
  19. return _one_hot(index, NUM_LETTERS)
  20. def word_to_indices(word):
  21. """returns a list of character indices
  22. Args:
  23. word: string
  24. Return:
  25. indices: int list with length len(word)
  26. """
  27. indices = []
  28. for c in word:
  29. indices.append(ALL_LETTERS.find(c))
  30. return indices
  31. # ------------------------
  32. # utils for sent140 dataset
  33. def split_line(line):
  34. """split given line/phrase into list of words
  35. Args:
  36. line: string representing phrase to be split
  37. Return:
  38. list of strings, with each string representing a word
  39. """
  40. return re.findall(r"[\w']+|[.,!?;]", line)
  41. def _word_to_index(word, indd):
  42. """returns index of given word based on given lookup dictionary
  43. returns the length of the lookup dictionary if word not found
  44. Args:
  45. word: string
  46. indd: dictionary with string words as keys and int indices as values
  47. """
  48. if word in indd:
  49. return indd[word]
  50. else:
  51. return len(indd)
  52. def line_to_indices(line, word2id, max_words=25):
  53. """converts given phrase into list of word indices
  54. if the phrase has more than max_words words, returns a list containing
  55. indices of the first max_words words
  56. if the phrase has less than max_words words, repeatedly appends integer
  57. representing unknown index to returned list until the list's length is
  58. max_words
  59. Args:
  60. line: string representing phrase/sequence of words
  61. word2id: dictionary with string words as keys and int indices as values
  62. max_words: maximum number of word indices in returned list
  63. Return:
  64. indl: list of word indices, one index for each word in phrase
  65. """
  66. unk_id = len(word2id)
  67. line_list = split_line(line) # split phrase in words
  68. indl = [word2id[w] if w in word2id else unk_id for w in line_list[:max_words]]
  69. indl += [unk_id] * (max_words - len(indl))
  70. return indl
  71. def bag_of_words(line, vocab):
  72. """returns bag of words representation of given phrase using given vocab
  73. Args:
  74. line: string representing phrase to be parsed
  75. vocab: dictionary with words as keys and indices as values
  76. Return:
  77. integer list
  78. """
  79. bag = [0] * len(vocab)
  80. words = split_line(line)
  81. for w in words:
  82. if w in vocab:
  83. bag[vocab[w]] += 1
  84. return bag
  85. def get_word_emb_arr(path):
  86. with open(path, 'r') as inf:
  87. embs = json.load(inf)
  88. vocab = embs['vocab']
  89. word_emb_arr = np.array(embs['emba'])
  90. indd = {}
  91. for i in range(len(vocab)):
  92. indd[vocab[i]] = i
  93. vocab = {w: i for i, w in enumerate(embs['vocab'])}
  94. return word_emb_arr, indd, vocab
  95. def val_to_vec(size, val):
  96. """Converts target into one-hot.
  97. Args:
  98. size: Size of vector.
  99. val: Integer in range [0, size].
  100. Returns:
  101. vec: one-hot vector with a 1 in the val element.
  102. """
  103. assert 0 <= val < size
  104. vec = [0 for _ in range(size)]
  105. vec[int(val)] = 1
  106. return vec