Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
logger.info("Importing PDF from '%s'" % (file_uri))
idx = 0
for child in self.fs.recurse(file_uri):
gc.collect()
if not self.check_file_type(child):
continue
h = PdfDoc.hash_file(self.fs, child)
if docsearch.is_hash_in_index(h):
logger.info(
"Document %s already found in the index. Skipped",
child
)
continue
imported.append(child)
doc = PdfDoc(self.fs, docsearch.rootdir)
error = doc.import_pdf(child)
if error:
continue
docs.append(doc)
pages += [p for p in doc.pages]
idx += 1
return ImportResult(
imported_file_uris=imported,
select_doc=doc, new_docs=docs,
new_docs_pages=pages,
stats={
_("PDF"): len(docs),
_("Document(s)"): len(docs),
_("Page(s)"): sum([d.nb_pages for d in docs]),
},
from .util import strip_accents
logger = logging.getLogger(__name__)

# Lightweight record types. The field names are the only contract visible
# here: COMMAND bundles a callable with its positional and keyword
# arguments, RESULT pairs an exception slot with a return-value slot.
COMMAND = collections.namedtuple("COMMAND", "func args kwargs")
RESULT = collections.namedtuple("RESULT", "exc ret")
# Registry of supported document kinds, one 3-tuple per kind:
#   (is_*_doc callable, the class's `doctype` attribute, the class itself).
# NOTE(review): the is_*_doc entries are presumably detection predicates
# and the list order presumably is the lookup order -- confirm at call sites.
DOC_TYPE_LIST = [
    (is_pdf_doc, PdfDoc.doctype, PdfDoc),
    (is_img_doc, ImgDoc.doctype, ImgDoc),
]
class PaperworkIndex(object):
    # Current (static) Whoosh schema used by this index class.
    WHOOSH_SCHEMA = whoosh.fields.Schema(
        # Identifier fields: docid is declared unique, doctype is not.
        docid=whoosh.fields.ID(unique=True, stored=True),
        doctype=whoosh.fields.ID(unique=False, stored=True),
        docfilehash=whoosh.fields.ID(stored=True),
        # Text field, stored and with spelling support enabled.
        content=whoosh.fields.TEXT(stored=True, spelling=True),
        # Comma-separated, scorable keyword field.
        label=whoosh.fields.KEYWORD(
            commas=True,
            scorable=True,
            stored=True,
        ),
        # Stored datetime fields.
        date=whoosh.fields.DATETIME(stored=True),
        last_read=whoosh.fields.DATETIME(stored=True),
    )
def clone(self):
    """Return a new PdfDoc built from this object's fs, path and docid."""
    duplicate = PdfDoc(self.fs, self.path, self.docid)
    return duplicate
doc = None
docs = []
pages = []
file_uris = [self.fs.safe(uri) for uri in file_uris]
imported = []
for file_uri in file_uris:
logger.info("Importing PDF from '%s'" % (file_uri))
idx = 0
for child in self.fs.recurse(file_uri):
gc.collect()
if not self.check_file_type(child):
continue
h = PdfDoc.hash_file(self.fs, child)
if docsearch.is_hash_in_index(h):
logger.info(
"Document %s already found in the index. Skipped",
child
)
continue
imported.append(child)
doc = PdfDoc(self.fs, docsearch.rootdir)
error = doc.import_pdf(child)
if error:
continue
docs.append(doc)
pages += [p for p in doc.pages]
idx += 1
return ImportResult(
imported_file_uris=imported,