Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
print(json.dumps(self.parameter))
class DummyProcessorWithRequiredParameters(Processor):
    """Test double whose tool description declares one required parameter.

    The constructor always overrides the ``version`` and ``ocrd_tool``
    keyword arguments before delegating to ``Processor.__init__``; the
    tool description lists a single parameter, ``i-am-required``, flagged
    as required.
    """

    def __init__(self, *args, **kwargs):
        # Tool description with exactly one parameter, marked as required.
        required_tool = {
            'executable': 'ocrd-test',
            'steps': ['recognition/post-correction'],
            'parameters': {
                'i-am-required': {'required': True},
            },
        }
        # Same insertion order as assigning 'version' first, then 'ocrd_tool'.
        kwargs.update(version='0.0.1', ocrd_tool=required_tool)
        super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs)

    def process(self):
        # Intentionally a no-op: only construction is of interest here.
        pass
class IncompleteProcessor(Processor):
    """Processor subclass that adds and overrides nothing.

    ``test_incomplete_processor`` expects calling ``process()`` on an
    instance to raise an exception matching 'Must be implemented'.
    """
class TestProcessor(TestCase):
    # Tests for Processor construction, default parameters and run_processor.
    # '#' comments are used instead of docstrings on the test methods so that
    # unittest's verbose output keeps showing the method names.

    def setUp(self):
        # Fresh resolver and workspace (from the sample METS asset) per test.
        resolver = Resolver()
        self.resolver = resolver
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        self.workspace = resolver.workspace_from_url(mets_url)

    def test_incomplete_processor(self):
        # A subclass that does not override process() must refuse to run.
        expected_message = 'Must be implemented'
        incomplete = IncompleteProcessor(None)
        with self.assertRaisesRegex(Exception, expected_message):
            incomplete.process()

    def test_no_resolver(self):
        # run_processor called with only the processor class must fail.
        expected_message = 'pass a resolver to create a workspace'
        with self.assertRaisesRegex(Exception, expected_message):
            run_processor(DummyProcessor)

    def test_params(self):
        # A plain Processor built with only a workspace has no parameters.
        processor = Processor(workspace=self.workspace)
        self.assertEqual(processor.parameter, {})
'default': 'bla'
}
}
}
class DummyProcessor(Processor):
    """Test double configured with ``DUMMY_TOOL``.

    The constructor always overrides the ``ocrd_tool`` and ``version``
    keyword arguments; ``process`` prints the processor's parameters
    serialized as JSON.
    """

    def __init__(self, *args, **kwargs):
        # Same insertion order as assigning 'ocrd_tool' first, then 'version'.
        kwargs.update(ocrd_tool=DUMMY_TOOL, version='0.0.1')
        super(DummyProcessor, self).__init__(*args, **kwargs)

    def process(self):
        serialized = json.dumps(self.parameter)
        print(serialized)
class DummyProcessorWithRequiredParameters(Processor):
    """Test double whose tool description declares one required parameter.

    The constructor always overrides the ``version`` and ``ocrd_tool``
    keyword arguments before delegating to ``Processor.__init__``; the
    tool description lists a single parameter, ``i-am-required``, flagged
    as required.
    """

    def __init__(self, *args, **kwargs):
        # Tool description with exactly one parameter, marked as required.
        required_tool = {
            'executable': 'ocrd-test',
            'steps': ['recognition/post-correction'],
            'parameters': {
                'i-am-required': {'required': True},
            },
        }
        # Same insertion order as assigning 'version' first, then 'ocrd_tool'.
        kwargs.update(version='0.0.1', ocrd_tool=required_tool)
        super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs)

    def process(self):
        # Intentionally a no-op: only construction is of interest here.
        pass
class IncompleteProcessor(Processor):
    """Processor subclass that adds and overrides nothing.

    ``test_incomplete_processor`` expects calling ``process()`` on an
    instance to raise an exception matching 'Must be implemented'.
    """
class TestProcessor(TestCase):
from ocrd.resolver import Resolver
from ocrd.processor.base import Processor, run_processor, run_cli
# Minimal tool description used by DummyProcessor: one executable, one
# processing step and a single optional string parameter 'baz' that
# defaults to 'bla'.
DUMMY_TOOL = {
    'executable': 'ocrd-test',
    'steps': ['recognition/post-correction'],
    'parameters': {
        'baz': {
            'type': 'string',
            'default': 'bla',
        },
    },
}
class DummyProcessor(Processor):
    """Test double configured with ``DUMMY_TOOL``.

    The constructor always overrides the ``ocrd_tool`` and ``version``
    keyword arguments; ``process`` prints the processor's parameters
    serialized as JSON.
    """

    def __init__(self, *args, **kwargs):
        # Same insertion order as assigning 'ocrd_tool' first, then 'version'.
        kwargs.update(ocrd_tool=DUMMY_TOOL, version='0.0.1')
        super(DummyProcessor, self).__init__(*args, **kwargs)

    def process(self):
        serialized = json.dumps(self.parameter)
        print(serialized)
class DummyProcessorWithRequiredParameters(Processor):
def process(self): pass
def __init__(self, *args, **kwargs):
kwargs['version'] = '0.0.1'
kwargs['ocrd_tool'] = {
'executable': 'ocrd-test',
'steps': ['recognition/post-correction'],
from __future__ import absolute_import
import os
from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX
import tesserocr
log = getLogger('Tesseract3RegionSegmenter')
class Tesseract3RegionSegmenter(Processor):
def process(self):
"""
Performs the region segmentation.
"""
with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
for (n, input_file) in enumerate(self.input_files):
page = OcrdPage.from_file(self.workspace.download_file(input_file))
image = self.workspace.resolve_image_as_pil(page.imageFileName)
log.debug("Detecting regions with tesseract")
tessapi.SetImage(image)
for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
box, index = component[1], component[2]
# the region reference in the reading order element
ID = "r%i" % index
page.add_reading_order_ref(ID, index)
# -*- coding: utf-8 -*-
from __future__ import absolute_import
# import re
import exiftool
from ocrd.constants import EXIF_COMPRESSION_METHODS, EXIF_PHOTOMETRICINTERPRETATION_VALUES, EXIF_RESOLUTIONUNIT_VALUES
from ocrd.processor.base import Processor
from ocrd.model.ocrd_page import OcrdPage
class ExifProcessor(Processor):
"""
Extracts image meta data.
"""
def verify(self):
"""
Ensure that the output is only pages.

NOTE(review): the current body performs no check at all and
unconditionally returns True -- confirm whether a real
verification is still to be implemented.
"""
return True
def process(self):
"""
Performs the image characterization.
"""
with exiftool.ExifTool() as et:
for input_file in self.workspace.mets.find_files(fileGrp='INPUT'):
from __future__ import absolute_import
from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX
import tesserocr
log = getLogger('processor.segment_line.tesserocr')
class Tesseract3LineSegmenter(Processor):
def process(self):
"""
Performs the line segmentation.
"""
with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
for (n, input_file) in enumerate(self.input_files):
page = OcrdPage.from_file(self.workspace.download_file(input_file))
image_url = page.imageFileName
for region in page.list_textregions():
log.debug("Detecting lines in %s with tesseract", region)
image = self.workspace.resolve_image_as_pil(image_url, region.coords)
tessapi.SetImage(image)
for component in tessapi.GetComponentImages(tesserocr.RIL.TEXTLINE, True):
region.add_textline(coords=component[1])
self.add_output_file(
from __future__ import absolute_import
from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX
import tesserocr
log = getLogger('processor.Tesseract3Recognizer')
DEFAULT_MODEL = tesserocr.get_languages()[1][-1]
class Tesseract3Recognizer(Processor):
def process(self):
"""
Performs the (text) recognition.
"""
with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
log.info("Using model %s in %s for recognition", tesserocr.get_languages()[0], tesserocr.get_languages()[1][-1])
tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
for (n, input_file) in enumerate(self.input_files):
log.info("INPUT FILE %i / %s", n, input_file)
self.workspace.download_file(input_file)
page = OcrdPage.from_file(input_file)
image_url = page.imageFileName
log.info("page %s", page)
for region in page.list_textregions():
textlines = region.list_textlines()