42
42
43
43
def term_frequency (term : str , document : str ) -> int :
44
44
"""
45
- A function that returns the number of times a term occurs within
45
+ Return the number of times a term occurs within
46
46
a given document.
47
47
@params: term, the term to search a document for, and document,
48
48
the document to search within
@@ -58,15 +58,14 @@ def term_frequency(term : str, document : str) -> int:
58
58
str .maketrans ("" , "" , string .punctuation )
59
59
).replace ("\n " , "" )
60
60
tokenize_document = document_without_punctuation .split (" " ) # word tokenization
61
- term_frequency = len (
61
+ return len (
62
62
[word for word in tokenize_document if word .lower () == term .lower ()]
63
63
)
64
- return term_frequency
65
64
66
65
67
66
def document_frequency (term : str , corpus : str ) -> int :
68
67
"""
69
- A function that calculates the number of documents in a corpus that contain a
68
+ Calculate the number of documents in a corpus that contain a
70
69
given term
71
70
@params : term, the term to search each document for, and corpus, a collection of
72
71
documents. Each document should be separated by a newline.
@@ -83,15 +82,14 @@ def document_frequency(term: str, corpus: str) -> int:
83
82
) # strip all punctuation and replace it with ''
84
83
documents = corpus_without_punctuation .split ("\n " )
85
84
lowercase_documents = [document .lower () for document in documents ]
86
- document_frequency = len (
85
+ return len (
87
86
[document for document in lowercase_documents if term .lower () in document ]
88
- ) # number of documents that contain the term
89
- return document_frequency , len (documents )
87
+ ), len (documents )
90
88
91
89
92
90
def inverse_document_frequency (df : int , N : int ) -> float :
93
91
"""
94
- A function that returns an integer denoting the importance
92
+ Return a float denoting the importance
95
93
of a word. This measure of importance is
96
94
calculated by log10(N/df), where N is the
97
95
number of documents and df is
@@ -100,19 +98,27 @@ def inverse_document_frequency(df : int, N: int) -> float:
100
98
the number of documents in the corpus.
101
99
@returns : log10(N/df)
102
100
@examples :
101
+ >>> inverse_document_frequency(3, 0)
102
+ Traceback (most recent call last):
103
+ ...
104
+ ValueError: log10(0) is undefined.
103
105
>>> inverse_document_frequency(1, 3)
104
106
0.477
107
+ >>> inverse_document_frequency(0, 3)
108
+ Traceback (most recent call last):
109
+ ...
110
+ ZeroDivisionError: df must be > 0
105
111
"""
106
- try :
107
- idf = round ( log10 ( N / df ), 3 )
108
- return idf
109
- except ZeroDivisionError :
110
- print ( "The term you searched for is not in the corpus." )
112
+ if df == 0 :
113
+ raise ZeroDivisionError ( "df must be > 0" )
114
+ elif N == 0 :
115
+ raise ValueError ( "log10(0) is undefined." )
116
+ return round ( log10 ( N / df ), 3 )
111
117
112
118
113
119
def tf_idf (tf : int , idf : int ) -> float :
114
120
"""
115
- A function that combines the term frequency
121
+ Combine the term frequency
116
122
and inverse document frequency functions to
117
123
calculate the originality of a term. This
118
124
'originality' is calculated by multiplying
0 commit comments