SamhammerAG
diff --git a/‎.devcontainer/Dockerfile
-8 b/‎.devcontainer/Dockerfile
-8
diff --git a/‎README.md
+15-38 b/‎README.md
+15-38
diff --git a/‎ai_data_preprocessing_queue/Steps/ocr.py
-55 b/‎ai_data_preprocessing_queue/Steps/ocr.py
-55
diff --git a/‎ai_data_preprocessing_queue/services/__init__.py b/‎ai_data_preprocessing_queue/services/__init__.py
diff --git a/‎ai_data_preprocessing_queue/services/image_file_services.py
-46 b/‎ai_data_preprocessing_queue/services/image_file_services.py
-46
diff --git a/‎tests/test_data/test.jpg
-126 KB b/‎tests/test_data/test.jpg
-126 KB
diff --git a/‎tests/test_data/test.pdf
-9.08 KB b/‎tests/test_data/test.pdf
-9.08 KB
diff --git a/‎tests/test_data/test_180.tiff
-129 KB b/‎tests/test_data/test_180.tiff
-129 KB
diff --git a/‎tests/test_pipeline.py
-64 b/‎tests/test_pipeline.py
-64
@@ -5,11 +5,3 @@ FROM mcr.microsoft.com/devcontainers/python:3.8-bullseye
 # They would conflict with our pinned versions
 RUN pipx uninstall flake8
 RUN pipx uninstall mypy
-
-RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
-RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list
-RUN apt update
-ENV ACCEPT_EULA=Y
-RUN apt-get update && apt-get install -y build-essential git libgdiplus libx11-dev python3-opencv tesseract-ocr tesseract-ocr-deu && \
-    apt-get autoremove -y && \
-    rm -rf /var/lib/apt/lists/*
@@ -23,70 +23,47 @@ pre_processor_dict = {
 pipeline = Pipeline(pre_processor_dict)
 value = pipeline.consume('Input text', state)
 ```
-        
+
 
 State is optional here and can be used to cache preprocessing data between pipeline calls.
 
-The preprocessors that the pipeline should use have to be transmitted as keys within a dictionary.  
-Some preprocessors also require additional data to function.  
+The preprocessors that the pipeline should use have to be transmitted as keys within a dictionary.
+Some preprocessors also require additional data to function.
 The data has to be converted to string form and assigned to its preprocessor within the dictionary.
 
 This dictionary then needs to be transmitted to the pipeline through its constructor.
 
-```python
-import pdfplumber
-from ai_data_preprocessing_queue import Pipeline
-
-state = {"image_to_string": {"lang": "deu", "config": "--psm 1"}}
-pipeline = Pipeline({'ocr': None, 'text_only': None})
-
-value = ""
-with pdfplumber.open('test.pdf') as pdf:
-    for page in pdf.pages:
-        value += pipeline.consume(page, state)
-
-```
-
-Ocr step should be used first in the pipeline.
-
-For more info about which preprocessors need data and how this data needs to be formatted, see the preprocessor list below.
-
 Note: Pipeline has to be instantiated only once and can be reused.
 
 ## Existing preprocessors
 
 ### To Lower Case
-Name: to_lower  
-Required additional data: -  
+Name: to_lower
+Required additional data: -
 Converts the text to lower case characters.
 
-### OCR
-Name: ocr  
-Required additional data: -  
-Converts jpg, tiff files or pdf pages into plain text. This step accepts a `np.ndarray` from `PIL.Image.open`, `PIL.JpegImagePlugin.JpegImageFile` or `pdfplumber.page.Page` as `item` parameter.
-
 ### Remove Numbers
-Name: remove_numbers  
-Required additional data: -  
+Name: remove_numbers
+Required additional data: -
 Removes all numbers from the text.
 
 ### Remove Punctuation
-Name: remove_punctuation  
-Required additional data: -  
+Name: remove_punctuation
+Required additional data: -
 Removes all special characters from the text.
 
 ### Text only
-Name: text_only  
-Required additional data: -  
+Name: text_only
+Required additional data: -
 Removes all special characters and numbers from the text.
 
 ### Spellcheck (Levenshtein)
-Name: spellcheck  
-Required additional data: A string containing words, separated by newline, i.e. "word1\r\nword2"  
+Name: spellcheck
+Required additional data: A string containing words, separated by newline, i.e. "word1\r\nword2"
 Takes a list of words which depict correct spelling. Words within the given text that are close to a word from this list will be replaced with the listed word.
 
 ### Regex replacement
-Name: regex_replacement  
+Name: regex_replacement
 Required additional data: CSV-Data in string form with following line-format: &lt;pattern&gt;,&lt;replacement&gt;,&lt;order&gt;
   - pattern: a regex pattern that is to be found within the text
   - replacement: the word/text, with which any match should be replaced
@@ -95,7 +72,7 @@ Required additional data: CSV-Data in string form with following line-format: &l
 This one will take your text and search for occurrences of specific entities. Those are replaced by keywords. Using this approach, two text corpora are considered similar if both contain IBANs, phone numbers, etc.
 
 ### Token Replacement
-Name: token_replacement  
+Name: token_replacement
 Required additional data: CSV-Data in string form with following line-format: &lt;text&gt;,&lt;replacement&gt;,&lt;order&gt;
   - text: one or multiple words to search within the text
   - replacement: the word/text, with which any match should be replaced
 
@@ -1,12 +1,7 @@
 import unittest
 from os import path
 
-import cv2
-import pdfplumber
-from PIL import Image
-
 from ai_data_preprocessing_queue.Pipeline import Pipeline
-from ai_data_preprocessing_queue.services.image_file_services import scale_too_big_cv2_image, scale_too_big_image
 
 ABS_PATH_TEST_DATA = path.join(path.dirname(path.abspath(__file__)), "test_data")
 
@@ -125,65 +120,6 @@ def test_spellcheck_should_not_throw_exception_for_short_values(self) -> None:
         value = pipeline.consume("k koipe artikel")
         self.assertEqual("k kopie artikel", value)
 
-    def test_ocr_jpg_image(self) -> None:
-        pipeline = Pipeline({"ocr": None})
-        state = {"image_to_string": {"lang": "eng", "config": "--psm 1"}}
-        text = ""
-        with Image.open(path.join(ABS_PATH_TEST_DATA, "test.jpg")) as image:
-            image.load()
-            image.convert("RGBA")
-            text += pipeline.consume(image, state)
-        self.assertIn("There is a test of optical character recognition.", text)
-
-    def test_ocr_jpg_image_and_text_only(self) -> None:
-        pipeline = Pipeline({"ocr": None, "text_only": None})
-        state = {"image_to_string": {"lang": "eng", "config": "--psm 1"}}
-        text = ""
-        with Image.open(path.join(ABS_PATH_TEST_DATA, "test.jpg")) as image:
-            image.load()
-            image.convert("RGBA")
-            text += pipeline.consume(image, state)
-        self.assertEqual(text.find("."), -1)
-
-    def test_ocr_pdf_text_layer(self) -> None:
-        pipeline = Pipeline({"ocr": None})
-        state = {"image_to_string": {"lang": "eng", "config": "--psm 1"}}
-        text = ""
-        with pdfplumber.open(path.join(ABS_PATH_TEST_DATA, "test.pdf")) as pdf:
-            for page in pdf.pages:
-                text += pipeline.consume(page, state)
-        self.assertIn("There is a test of optical character recognition.", text)
-
-    def test_ocr_tiff_cv2_image(self) -> None:
-        image = cv2.imread(path.join(ABS_PATH_TEST_DATA, "test_180.tiff"), 0)
-        pipeline = Pipeline({"ocr": None})
-        state = {"image_to_string": {"lang": "eng", "config": "--psm 1"}}
-        text = pipeline.consume(image, state)
-        self.assertIn("There is a test of optical character recognition.", text)
-
-    def test_ocr_tiff_image_scaled(self) -> None:
-        image = Image.open(path.join(ABS_PATH_TEST_DATA, "test_180.tiff"))
-        pipeline = Pipeline({"ocr": None})
-        state = {"image_to_string": {"lang": "eng", "cut_of_size": 1700000}}
-        text = pipeline.consume(image, state)
-        self.assertIn("There is a test of optical character recognition.", text)
-
-    def test_scale_tiff_image(self) -> None:
-        image = Image.open(path.join(ABS_PATH_TEST_DATA, "test_180.tiff"))
-        size = image.size[0] * image.size[1]
-        scaling_factor = 10
-        image = scale_too_big_image(image, cutoff_size=size / scaling_factor)
-        scaled_size = image.size[0] * image.size[1] * scaling_factor**2
-        self.assertAlmostEqual(1, scaled_size / size, places=1)
-
-    def test_scale_tiff_cv2_image(self) -> None:
-        image = cv2.imread(path.join(ABS_PATH_TEST_DATA, "test_180.tiff"), 0)
-        size = image.shape[1] * image.shape[0]
-        scaling_factor = 10
-        image = scale_too_big_cv2_image(image, cutoff_size=size / scaling_factor)
-        scaled_size = image.shape[0] * image.shape[1] * scaling_factor**2
-        self.assertAlmostEqual(1, scaled_size / size, places=1)
-
 
 if __name__ == "__main__":
     unittest.main()