Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def set_builder(self):
    """Store a newly created ``builders.TextBuilder`` in ``self._builder``."""
    text_builder = builders.TextBuilder()
    self._builder = text_builder
# Fragment of an OCR routine: upscale `image`, boost its contrast, convert it
# to grayscale, run OCR on it, then normalise the text and drop blank lines.
# NOTE(review): the enclosing function header is not visible here and the
# body of the trailing `if debug:` is missing -- confirm against the full file.
# Scale up 3x on both axes (cubic interpolation), which oddly helps with OCR
# NOTE(review): height/width are not used anywhere in the visible lines.
height, width = image.shape[:2]
image = cv2.resize(image, (0,0), fx=3, fy=3, interpolation=cv2.INTER_CUBIC)
# Increase contrast. Must be done before grayscale conversion
image = cv2utils.increaseContrast(image)
# Convert to grayscale (the conversion code used assumes BGR channel order)
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Convert to PIL format
pil = Image.fromarray(image)
# OCR the text with a fresh TextBuilder and the configured language
txt = self.tool.image_to_string(pil, lang=self.lang, builder=pyocr.builders.TextBuilder())
# Apply NFKD (compatibility decomposition) normalisation: compatibility
# characters are replaced by their equivalents and accented characters are
# split into base character + combining mark. This is bad news for i18n, but
# helps us with a lot of OCR issues.
# NOTE: NFKD alone does not remove non-ASCII code points; no ASCII encoding
# step is visible in this fragment.
txt = unicodedata.normalize('NFKD', txt)
lines = txt.split("\n")
# Sometimes OCR will insert extra empty lines, so let's strip them out.
# Lines that are empty or whitespace-only are dropped; kept lines are left
# exactly as they were (not stripped).
newlines = []
for i in range(len(lines)):
if not len(lines[i].strip()) == 0:
newlines.append(lines[i])
lines = newlines
if debug:
# Fragment: walk the entries of `path`, OCR those whose extension is listed in
# VALIDITY, hand the text to self.savefile, and print progress/summary lines.
# NOTE(review): the enclosing function header and the original indentation are
# not visible here; `count` and `other_files` are presumably initialised to 0
# before this loop -- confirm against the full file.
for f in os.listdir(path): # Iterate over the entry names in the `path` directory
ext = os.path.splitext(f)[1] # Extension part of the name, including the dot (e.g. ".png", ".jpg")
if ext.lower() not in VALIDITY: # Case-insensitive check against the accepted extensions
other_files += 1 # Count entries whose extension is not accepted
continue
else:
count += 1
image_file_name = path + '/' + f #Full /dir/path/filename.extension
# `tool` is a bare name here (not self.tool); it must come from the enclosing scope
txt = tool.image_to_string(
Im.open(image_file_name), lang=self.lang,
builder=pyocr.builders.TextBuilder()
)
#txt = txt.split()[:5]
initial = txt.replace('\a', ' ').replace('\b', ' ').replace('\f', ' ').replace('\n',' ').replace('\r', '').replace('\t',' ').replace('\v',' ') # Flatten to one line: \a \b \f \n \t \v each become a space, \r is removed outright
initial = initial[:60] # Keep only the first 60 characters
print('Filename:' + initial + '\n')
# NOTE(review): sets mode 0o777 (rwx for owner, group and others) on `path` -- confirm this is intended
os.chmod(path, 0o777)
self.savefile(initial, txt, directory_path)
print(str(count) + (" file" if count == 1 else " files") + " processed")
if count + other_files == 0:
print("No files found") # Neither accepted nor skipped entries were counted
else :
print(str(count) + " / " + str(count + other_files) + " files converted")
def execute(self, *args, **kwargs):
    """
    Run OCR on the page image supplied by the converter.

    Delegates to the parent ``execute`` first, then opens
    ``self.converter.get_page()`` as an image and passes it to
    ``self.tool.image_to_string`` inside a ``c_locale()`` context, using
    ``self.language``. The recognised text is bound to a local name and is
    not returned by the code shown here.

    Raises:
        OCRError: if anything inside the OCR call raises. The message is
            logged first, and gains an extra hint when ``self.language``
            is not listed in ``self.languages``.
    """
    super(PyOCR, self).execute(*args, **kwargs)

    page_image = Image.open(self.converter.get_page())
    try:
        with c_locale():
            recognize = self.tool.image_to_string
            result = recognize(
                page_image,
                lang=self.language,
                builder=pyocr.builders.TextBuilder(),
            )
    except Exception as exception:
        base_template = 'Exception calling pyocr with language option: {}; {}'
        error_message = base_template.format(self.language, exception)

        language_missing = self.language not in self.languages
        if language_missing:
            # Append an installation hint to the generic failure message.
            hint_template = (
                '{}\nThe requested OCR language "{}" is not '
                'available and needs to be installed.\n'
            )
            error_message = hint_template.format(error_message, self.language)

        logger.error(error_message)
        raise OCRError(error_message)
def ocr_img(self, imgname):
    """
    Run OCR on the image file ``imgname`` and return the recognised text.

    The first tool reported by ``pyocr.get_available_tools()`` is used with
    the ``eng`` language and a plain ``TextBuilder``. The text is also
    printed to stdout before being returned.

    To use a non-standard language pack named foo.traineddata, set the
    TESSDATA_PREFIX environment variable so the file can be found at
    TESSDATA_PREFIX/tessdata/foo.traineddata and give Tesseract the
    argument -l foo.

    Raises:
        IndexError: if pyocr reports no available OCR tool.
    """
    tools = pyocr.get_available_tools()
    tool = tools[0]
    # Open the image in a ``with`` block so its file handle is released as
    # soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(imgname) as image:
        text = tool.image_to_string(
            image,
            lang='eng',
            builder=pyocr.builders.TextBuilder(),
        )
    print(text)
    return text
# alternate approach using pyocr
from PIL import Image
import sys
import pyocr
import pyocr.builders
import codecs
# Use the first OCR tool that pyocr reports as available.
tool = pyocr.get_available_tools()[0]
builder = pyocr.builders.TextBuilder()

# Open the image in a ``with`` block so the file handle is closed as soon as
# OCR is done rather than staying open for the rest of the script.
with Image.open('file.tiff') as image:
    txt = tool.image_to_string(
        image,
        lang="eng",
        builder=builder
    )
# txt is a Python string

with codecs.open("toto.txt", 'w', encoding='utf-8') as file_descriptor:
    builder.write_file(file_descriptor, txt)
# toto.txt is a simple text file, encoded in utf-8
def main(self, text_img_name):
    """
    OCR the image at ``text_img_name`` and return the recognised text.

    The image is opened with ``Im.open`` and passed to
    ``tool.image_to_string`` together with ``self.lang`` and a fresh
    ``pyocr.builders.TextBuilder``. Note that ``tool`` is looked up from the
    enclosing scope, not from ``self``.
    """
    recognize = tool.image_to_string
    source_image = Im.open(text_img_name)
    return recognize(
        source_image,
        lang=self.lang,
        builder=pyocr.builders.TextBuilder(),
    )
def ocr(self, images):
    '''Run OCR over a sequence of ``(area, image)`` pairs.

    Each image (element 1 of the pair) is passed to
    ``self.tool.image_to_string`` with the first entry of ``self.langs`` and
    a fresh ``TextBuilder``. Every result is printed to stdout and collected.

    Returns a list of ``(area, text)`` tuples in input order; an empty input
    yields an empty list.'''
    collected = []
    for entry in images:
        recognized = self.tool.image_to_string(
            entry[1],
            lang=self.langs[0],
            builder=pyocr.builders.TextBuilder(),
        )
        area = entry[0]
        print("==RESULT==" + str(area) + "\n" + recognized + "\n==========================")
        collected.append((area, recognized))
    return collected