How to use the pyocr.builders module in pyocr

To help you get started, we’ve selected a few pyocr examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openpaperwork / pyocr / tests / tests_base.py View on Github external
def set_builder(self):
        """Store a new ``builders.DigitLineBoxBuilder`` in ``self._builder``."""
        self._builder = builders.DigitLineBoxBuilder()
github openpaperwork / paperwork / src / paperwork / backend / img / page.py View on Github external
def __get_boxes(self):
        """
        Get all the boxes of this page.

        The box file at ``self.__box_path`` is first parsed as line boxes
        (``pyocr.builders.LineBoxBuilder``). If that yields nothing, the file
        is parsed again as word boxes (``pyocr.builders.WordBoxBuilder``),
        which is the old format, and a warning is logged.

        Returns the boxes read from the file, or an empty list if the file
        could not be read (the IOError is logged, not raised).
        """
        boxfile = self.__box_path

        try:
            box_builder = pyocr.builders.LineBoxBuilder()
            with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
                boxes = box_builder.read_file(file_desc)
            if boxes:
                return boxes
            # fallback: old format: word boxes
            # shouldn't be used anymore ...
            logger.warning("WARNING: Doc %s uses old box format", self.doc)
            box_builder = pyocr.builders.WordBoxBuilder()
            with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
                boxes = box_builder.read_file(file_desc)
            return boxes
        except IOError as exc:
            # "except X as e" is valid on Python 2.6+ and Python 3, unlike
            # the Python-2-only "except X, e" form.
            logger.error("Unable to get boxes for '%s': %s",
                         self.doc.docid, exc)
            return []
github openpaperwork / paperwork / paperwork-backend / paperwork_backend / pdf / page.py View on Github external
def __get_boxes(self):
        """
        Get all the word boxes of this page.

        Returns the cached ``self.__boxes`` when it is already set. Otherwise,
        if a box file exists at ``self.__get_box_path()``, the boxes are read
        from it with a ``pyocr.builders.LineBoxBuilder``, cached in
        ``self.__boxes`` and returned. If the file does not exist or reading
        it raises IOError (which is logged), the code falls back on the text
        reported by the PDF page.

        NOTE(review): this excerpt ends right after the PDF text and layout
        are fetched; the rest of the fallback is not visible here.
        """
        if self.__boxes is not None:
            return self.__boxes

        # Check first if there is an OCR file available
        boxfile = self.__get_box_path()
        if self.fs.exists(boxfile):
            box_builder = pyocr.builders.LineBoxBuilder()

            try:
                with self.fs.open(boxfile, 'r') as file_desc:
                    self.__boxes = box_builder.read_file(file_desc)
                return self.__boxes
            except IOError as exc:
                logger.error("Unable to get boxes for '%s': %s"
                             % (self.doc.docid, exc))
                # will fall back on pdf boxes

        # fall back on what libpoppler tells us

        txt = self.pdf_page.get_text()
        # start from an empty list; presumably filled from `layout` in the
        # part of the function that is not visible here -- TODO confirm
        self.__boxes = []

        layout = self.pdf_page.get_text_layout()
github openpaperwork / paperwork / src / paperwork / backend / pdf / page.py View on Github external
def __get_boxes(self):
        """
        Get all the word boxes of this page.

        Returns the cached ``self.__boxes`` when it is already set. Otherwise,
        if ``os.stat()`` succeeds on the box file path, the boxes are read
        from that file with a ``pyocr.builders.LineBoxBuilder``, cached in
        ``self.__boxes`` and returned. A failed ``os.stat()`` is ignored and
        an IOError while reading is logged; in both cases the code falls
        through to the PDF fallback.

        NOTE(review): this excerpt ends before the PDF fallback itself; that
        part is not visible here.
        """
        if self.__boxes is not None:
            return self.__boxes

        # Check first if there is an OCR file available
        boxfile = self.__get_box_path()
        try:
            os.stat(boxfile)

            box_builder = pyocr.builders.LineBoxBuilder()

            try:
                with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
                    self.__boxes = box_builder.read_file(file_desc)
                return self.__boxes
            except IOError as exc:
                # "except X as e" is valid on Python 2.6+ and Python 3,
                # unlike the Python-2-only "except X, e" form.
                logger.error("Unable to get boxes for '%s': %s",
                             self.doc.docid, exc)
                # will fall back on pdf boxes
        except OSError:  # os.stat() failed
            pass

        # fall back on what libpoppler tells us

        # TODO: Line support !
github mathics / Mathics / mathics / builtin / image.py View on Github external
langs = best_tool.get_available_languages()
        if py_language_code not in langs:
            # if we use Tesseract, then this means copying the necessary language files from
            # https://github.com/tesseract-ocr/tessdata to tessdata, which is
            # usually located at /usr/share/tessdata or similar, but there's no API to query
            # the exact location, so we cannot, for now, give a better message.

            evaluation.message('TextRecognize', 'lang', py_language, best_tool.get_name())
            return

        import pyocr.builders

        text = best_tool.image_to_string(
            image.pil(),
            lang=py_language_code,
            builder=pyocr.builders.TextBuilder())

        if isinstance(text, (list, tuple)):
            text = '\n'.join(text)

        return String(text)
github camelot-dev / camelot / camelot / ocr.py View on Github external
cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
                y_cuts = find_cuts(table_image, char_scale=self.char_scale)
                rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
                table = Table(cols, rows)
                for i in range(len(table.cells)):
                    for j in range(len(table.cells[i])):
                        x1 = int(table.cells[i][j].x1)
                        y1 = int(table.cells[i][j].y1)
                        x2 = int(table.cells[i][j].x2)
                        y2 = int(table.cells[i][j].y2)
                        table.cells[i][j].image = table_image[y1:y2,x1:x2]
                        cell_image = Image.fromarray(table.cells[i][j].image)
                        text = self.tool.image_to_string(
                            cell_image,
                            lang=self.lang,
                            builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
                        )
                        table.cells[i][j].add_text(text)
                ar = table.get_list()
                ar.reverse()
                ar = encode_list(ar)
                table_data['data'] = ar
                tables['table-{0}'.format(table_no + 1)] = table_data
                table_no += 1
        page[os.path.basename(bname)] = tables

        return page