Skip to content

Commit ffe89b7

Browse files
committed
KIT-2545 Remove ocr
1 parent 152e167 commit ffe89b7

File tree

9 files changed

+15
-211
lines changed

9 files changed

+15
-211
lines changed

.devcontainer/Dockerfile

-8
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,3 @@ FROM mcr.microsoft.com/devcontainers/python:3.8-bullseye
55
# They would conflict with our pinned versions
66
RUN pipx uninstall flake8
77
RUN pipx uninstall mypy
8-
9-
RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
10-
RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
11-
RUN apt update
12-
ENV ACCEPT_EULA=Y
13-
RUN apt-get update && apt-get install -y build-essential git libgdiplus libx11-dev python3-opencv tesseract-ocr tesseract-ocr-deu && \
14-
apt-get autoremove -y && \
15-
rm -rf /var/lib/apt/lists/*

README.md

+15-38
Original file line numberDiff line numberDiff line change
@@ -23,70 +23,47 @@ pre_processor_dict = {
2323
pipeline = Pipeline(pre_processor_dict)
2424
value = pipeline.consume('Input text', state)
2525
```
26-
26+
2727

2828
State is optional here and can be used to cache preprocessing data between pipeline calls.
2929

30-
The preprocessors that the pipeline should use have to be transmitted as keys within a dictionary.
31-
Some preprocessors also require additional data to function.
30+
The preprocessors that the pipeline should use have to be transmitted as keys within a dictionary.
31+
Some preprocessors also require additional data to function.
3232
The data has to be converted to string form and assigned to its preprocessor within the dictionary.
3333

3434
This dictionary then needs to be transmitted to the pipeline through its constructor.
3535

36-
```python
37-
import pdfplumber
38-
from ai_data_preprocessing_queue import Pipeline
39-
40-
state = {"image_to_string": {"lang": "deu", "config": "--psm 1"}}
41-
pipeline = Pipeline({'ocr': None, 'text_only': None})
42-
43-
value = ""
44-
with pdfplumber.open('test.pdf') as pdf:
45-
for page in pdf.pages:
46-
value += pipeline.consume(page, state)
47-
48-
```
49-
50-
Ocr step should be used first in the pipeline.
51-
52-
For more info about which preprocessors need data and how this data needs to be formatted, see the preprocessor list below.
53-
5436
Note: Pipeline has to be instantiated only once and can be reused.
5537

5638
## Existing preprocessors
5739

5840
### To Lower Case
59-
Name: to_lower
60-
Required additional data: -
41+
Name: to_lower
42+
Required additional data: -
6143
Converts the text to lower case characters.
6244

63-
### OCR
64-
Name: ocr
65-
Required additional data: -
66-
Converts jpg, tiff files or pdf pages into plain text. This step accepts a `np.ndarray` from `PIL.Image.open`, `PIL.JpegImagePlugin.JpegImageFile` or `pdfplumber.page.Page` as `item` parameter.
67-
6845
### Remove Numbers
69-
Name: remove_numbers
70-
Required additional data: -
46+
Name: remove_numbers
47+
Required additional data: -
7148
Removes all numbers from the text.
7249

7350
### Remove Punctuation
74-
Name: remove_punctuation
75-
Required additional data: -
51+
Name: remove_punctuation
52+
Required additional data: -
7653
Removes all special characters from the text.
7754

7855
### Text only
79-
Name: text_only
80-
Required additional data: -
56+
Name: text_only
57+
Required additional data: -
8158
Removes all special characters and numbers from the text.
8259

8360
### Spellcheck (Levenshtein)
84-
Name: spellcheck
85-
Required additional data: A string containing words, separated by newline, i.e. "word1\r\nword2"
61+
Name: spellcheck
62+
Required additional data: A string containing words, separated by newline, i.e. "word1\r\nword2"
8663
Takes a list of correctly spelled words. Words within the given text that are close to a word from this list will be replaced with the listed word.
8764

8865
### Regex replacement
89-
Name: regex_replacement
66+
Name: regex_replacement
9067
Required additional data: CSV-Data in string form with following line-format: <pattern>,<replacement>,<order>
9168
- pattern: a regex pattern that is to be found within the text
9269
- replacement: the word/text, with which any match should be replaced
@@ -95,7 +72,7 @@ Required additional data: CSV-Data in string form with following line-format: &l
9572
This one will take your text and search for occurrences of specific entities. Those are replaced by keywords. Using this approach, two text corpora are similar if both contain IBANs/phone numbers/etc.
9673

9774
### Token Replacement
98-
Name: token_replacement
75+
Name: token_replacement
9976
Required additional data: CSV-Data in string form with following line-format: <text>,<replacement>,<order>
10077
- text: one or multiple words to search within the text
10178
- replacement: the word/text, with which any match should be replaced

ai_data_preprocessing_queue/Steps/ocr.py

-55
This file was deleted.

ai_data_preprocessing_queue/services/__init__.py

Whitespace-only changes.

ai_data_preprocessing_queue/services/image_file_services.py

-46
This file was deleted.

tests/test_data/test.jpg

-126 KB
Binary file not shown.

tests/test_data/test.pdf

-9.08 KB
Binary file not shown.

tests/test_data/test_180.tiff

-129 KB
Binary file not shown.

tests/test_pipeline.py

-64
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
11
import unittest
22
from os import path
33

4-
import cv2
5-
import pdfplumber
6-
from PIL import Image
7-
84
from ai_data_preprocessing_queue.Pipeline import Pipeline
9-
from ai_data_preprocessing_queue.services.image_file_services import scale_too_big_cv2_image, scale_too_big_image
105

116
ABS_PATH_TEST_DATA = path.join(path.dirname(path.abspath(__file__)), "test_data")
127

@@ -125,65 +120,6 @@ def test_spellcheck_should_not_throw_exception_for_short_values(self) -> None:
125120
value = pipeline.consume("k koipe artikel")
126121
self.assertEqual("k kopie artikel", value)
127122

128-
def test_ocr_jpg_image(self) -> None:
129-
pipeline = Pipeline({"ocr": None})
130-
state = {"image_to_string": {"lang": "eng", "config": "--psm 1"}}
131-
text = ""
132-
with Image.open(path.join(ABS_PATH_TEST_DATA, "test.jpg")) as image:
133-
image.load()
134-
image.convert("RGBA")
135-
text += pipeline.consume(image, state)
136-
self.assertIn("There is a test of optical character recognition.", text)
137-
138-
def test_ocr_jpg_image_and_text_only(self) -> None:
139-
pipeline = Pipeline({"ocr": None, "text_only": None})
140-
state = {"image_to_string": {"lang": "eng", "config": "--psm 1"}}
141-
text = ""
142-
with Image.open(path.join(ABS_PATH_TEST_DATA, "test.jpg")) as image:
143-
image.load()
144-
image.convert("RGBA")
145-
text += pipeline.consume(image, state)
146-
self.assertEqual(text.find("."), -1)
147-
148-
def test_ocr_pdf_text_layer(self) -> None:
149-
pipeline = Pipeline({"ocr": None})
150-
state = {"image_to_string": {"lang": "eng", "config": "--psm 1"}}
151-
text = ""
152-
with pdfplumber.open(path.join(ABS_PATH_TEST_DATA, "test.pdf")) as pdf:
153-
for page in pdf.pages:
154-
text += pipeline.consume(page, state)
155-
self.assertIn("There is a test of optical character recognition.", text)
156-
157-
def test_ocr_tiff_cv2_image(self) -> None:
158-
image = cv2.imread(path.join(ABS_PATH_TEST_DATA, "test_180.tiff"), 0)
159-
pipeline = Pipeline({"ocr": None})
160-
state = {"image_to_string": {"lang": "eng", "config": "--psm 1"}}
161-
text = pipeline.consume(image, state)
162-
self.assertIn("There is a test of optical character recognition.", text)
163-
164-
def test_ocr_tiff_image_scaled(self) -> None:
165-
image = Image.open(path.join(ABS_PATH_TEST_DATA, "test_180.tiff"))
166-
pipeline = Pipeline({"ocr": None})
167-
state = {"image_to_string": {"lang": "eng", "cut_of_size": 1700000}}
168-
text = pipeline.consume(image, state)
169-
self.assertIn("There is a test of optical character recognition.", text)
170-
171-
def test_scale_tiff_image(self) -> None:
172-
image = Image.open(path.join(ABS_PATH_TEST_DATA, "test_180.tiff"))
173-
size = image.size[0] * image.size[1]
174-
scaling_factor = 10
175-
image = scale_too_big_image(image, cutoff_size=size / scaling_factor)
176-
scaled_size = image.size[0] * image.size[1] * scaling_factor**2
177-
self.assertAlmostEqual(1, scaled_size / size, places=1)
178-
179-
def test_scale_tiff_cv2_image(self) -> None:
180-
image = cv2.imread(path.join(ABS_PATH_TEST_DATA, "test_180.tiff"), 0)
181-
size = image.shape[1] * image.shape[0]
182-
scaling_factor = 10
183-
image = scale_too_big_cv2_image(image, cutoff_size=size / scaling_factor)
184-
scaled_size = image.shape[0] * image.shape[1] * scaling_factor**2
185-
self.assertAlmostEqual(1, scaled_size / size, places=1)
186-
187123

188124
if __name__ == "__main__":
189125
unittest.main()

0 commit comments

Comments
 (0)