# Based on: https://www.askpython.com/python/examples/tf-idf-model-from-scratch
from typing import Dict, List

import numpy as np
from nltk.tokenize import word_tokenize  # may require a one-time nltk.download('punkt')

def create_counts(texts: List[str]):
    sentences = []
    word_set = []
    for sent in texts:
        # split each text into lowercase tokens, keeping alphabetic words only
        x = [i.lower() for i in word_tokenize(sent) if i.isalpha()]
        sentences.append(x)
        for word in x:
            if word not in word_set:
                word_set.append(word)
    # the vocabulary of unique words
    word_set = set(word_set)
    total_documents = len(sentences)
    # assign a unique index to each word in the vocabulary
    index_dict = {}
    for i, word in enumerate(word_set):
        index_dict[word] = i
    return sentences, word_set, total_documents, index_dict
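
# For the sample corpus defined at the bottom of this file, create_counts
# produces 4 tokenized sentences and a 9-word vocabulary; index_dict maps each
# word to an arbitrary (set-iteration-order) position, e.g. {'this': 0, ...}.
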
def count_dict(sentences: List[List[str]], word_set: set) -> Dict:
    """
    Document frequency of each word.
    @sentences: the list of tokenized sentences
    @word_set: the vocabulary of unique words (without their indices)
    return: {word: number of documents containing the word}
    """
    word_count = {}
    for word in word_set:
        word_count[word] = 0
        for sent in sentences:
            if word in sent:
                word_count[word] += 1
    return word_count
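
# On the sample corpus below this yields document frequencies such as
# {'this': 4, 'document': 3, 'first': 2, ...}: for instance, 'document'
# appears in 3 of the 4 sentences.
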
def termfreq(document: List[str], word: str) -> float:
    """
    Compute the term frequency according to the formula
    (number of occurrences of term t in doc d) / (total number of terms in doc d)
    @document: the list of words in the doc
    @word: a word from the vocabulary
    return: TF value
    """
    occurrence = document.count(word)
    return occurrence / len(document)
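
# A quick worked example: in ['this', 'document', 'is', 'the', 'second',
# 'document'], the word 'document' occurs 2 times out of 6 tokens, so its
# TF is 2/6 ≈ 0.33.
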
def inverse_doc_freq(word: str, total_documents: int, word_count: Dict) -> float:
    """
    Compute the IDF according to the formula
    log(num of docs / (num of docs containing term t + 1)),
    where the +1 guards against division by zero for unseen words.
    @word: a word from the vocabulary
    @total_documents: number of docs in the corpus
    @word_count: document frequencies {word: number of docs containing the word}
    return: IDF value
    """
    word_occurrence = word_count.get(word, 0) + 1
    return np.log(total_documents / word_occurrence)
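
# Note: with this smoothing, a word found in 3 of 4 docs gets log(4/4) = 0 and
# a word found in all 4 gets log(4/5) < 0, so very common words receive zero
# or negative weight. scikit-learn's smooth_idf avoids this by computing
# log((1 + N) / (1 + df)) + 1, which is always positive.
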
def tf_idf(sentence: List[str], vector_shape: int, index_dict: Dict, total_documents: int, word_count: Dict) -> np.ndarray:
    """
    Build the tf-idf vector of one sentence.
    @sentence: list of words in the sentence
    @vector_shape: number of unique words in the corpus
    @index_dict: {word: index} mapping
    @total_documents: number of docs in the corpus
    @word_count: document frequencies {word: number of docs containing the word}
    return: tf-idf vector as an np.ndarray
    """
    tf_idf_vec = np.zeros((vector_shape,))
    for word in sentence:
        tf = termfreq(sentence, word)
        idf = inverse_doc_freq(word, total_documents, word_count)
        tf_idf_vec[index_dict[word]] = tf * idf
    return tf_idf_vec
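
# Note: a word repeated inside the sentence is processed once per occurrence,
# but every pass writes the same tf * idf value into the same cell, so the
# result is correct, just slightly wasteful.
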
def create_vectors(texts: List[str]):
    vectors = []
    sentences, word_set, total_documents, index_dict = create_counts(texts)
    vector_shape = len(word_set)
    word_count = count_dict(sentences, word_set)
    for sent in sentences:
        vec = tf_idf(sent, vector_shape, index_dict, total_documents, word_count)
        vectors.append(vec)
    vectors = np.array(vectors)
    return vectors, index_dict

sentences = ['This is the first document.',
             'This document is the second document.',
             'And this is the third one.',
             'Is this the first document?']
vectors, word2id = create_vectors(sentences)
print(vectors.shape)  # (4, 9): 4 sentences, 9 unique words
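
# Optional sanity check against scikit-learn (a sketch, assuming scikit-learn
# is installed). The numbers will not match exactly: TfidfVectorizer uses the
# smoothed IDF log((1 + N) / (1 + df)) + 1 and L2-normalizes each row, but the
# matrix shape and vocabulary should agree with ours.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences)
print(X.shape)  # (4, 9) as well
print(sorted(vectorizer.get_feature_names_out()) == sorted(word2id))  # True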