Commit fcef21e

Add doctest examples and clean up docstrings
1 parent 4cd803a commit fcef21e

1 file changed: +20 -14 lines

machine_learning/word_frequency_functions.py

Lines changed: 20 additions & 14 deletions
@@ -42,7 +42,7 @@
 
 def term_frequency(term : str, document : str) -> int:
     """
-    A function that returns the number of times a term occurs within
+    Return the number of times a term occurs within
     a given document.
     @params: term, the term to search a document for, and document,
     the document to search within
@@ -58,15 +58,14 @@ def term_frequency(term : str, document : str) -> int:
         str.maketrans("", "", string.punctuation)
     ).replace("\n", "")
     tokenize_document = document_without_punctuation.split(" ") # word tokenization
-    term_frequency = len(
+    return len(
         [word for word in tokenize_document if word.lower() == term.lower()]
     )
-    return term_frequency
 
 
 def document_frequency(term: str, corpus: str) -> int:
     """
-    A function that calculates the number of documents in a corpus that contain a
+    Calculate the number of documents in a corpus that contain a
     given term
     @params : term, the term to search each document for, and corpus, a collection of
     documents. Each document should be separated by a newline.
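
For context (not part of this diff): the simplified term_frequency still strips punctuation, tokenizes on spaces, and counts case-insensitive matches, so an illustrative doctest-style check against a made-up sentence would be:

>>> term_frequency("to", "To be, or not to be")
2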
@@ -83,15 +82,14 @@ def document_frequency(term: str, corpus: str) -> int:
     ) # strip all punctuation and replace it with ''
     documents = corpus_without_punctuation.split("\n")
     lowercase_documents = [document.lower() for document in documents]
-    document_frequency = len(
+    return len(
         [document for document in lowercase_documents if term.lower() in document]
-    ) # number of documents that contain the term
-    return document_frequency, len(documents)
+    ), len(documents)
 
 
 def inverse_document_frequency(df : int, N: int) -> float:
     """
-    A function that returns an integer denoting the importance
+    Return an integer denoting the importance
     of a word. This measure of importance is
     calculated by log10(N/df), where N is the
     number of documents and df is
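
For callers, document_frequency still returns a (matching_documents, total_documents) tuple; it is just built directly in the return statement now. An illustrative doctest-style usage (corpus string made up, not part of this commit):

>>> corpus = "This is the first document.\nThis is the second document.\nThis is the third document."
>>> document_frequency("first", corpus)
(1, 3)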
@@ -100,19 +98,27 @@ def inverse_document_frequency(df : int, N: int) -> float:
     the number of documents in the corpus.
     @returns : log10(N/df)
     @examples :
+    >>> inverse_document_frequency(3, 0)
+    Traceback (most recent call last):
+        ...
+    ValueError: log10(0) is undefined.
     >>> inverse_document_frequency(1, 3)
     0.477
+    >>> inverse_document_frequency(0, 3)
+    Traceback (most recent call last):
+        ...
+    ZeroDivisionError: df must be > 0
     """
-    try:
-        idf = round(log10(N / df), 3)
-        return idf
-    except ZeroDivisionError:
-        print("The term you searched for is not in the corpus.")
+    if df == 0:
+        raise ZeroDivisionError("df must be > 0")
+    elif N == 0:
+        raise ValueError("log10(0) is undefined.")
+    return round(log10(N / df), 3)
 
 
 def tf_idf(tf : int, idf: int) -> float:
     """
-    A function that combines the term frequency
+    Combine the term frequency
     and inverse document frequency functions to
     calculate the originality of a term. This
     'originality' is calculated by multiplying
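
As a rough usage sketch (not part of this diff), the rewritten inverse_document_frequency now raises on bad input instead of printing, so a successful call simply returns the rounded log ratio:

>>> inverse_document_frequency(3, 10)
0.523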
