Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_get_plaintext_document_body(tmpdir):
    """Check plain-text extraction and the rejection of HTML documents.

    A plain-text file must be returned as the list of its lines (line
    endings kept), and an HTML file must raise
    ``UnknownDocumentTypeError`` with the ``text/html`` MIME type among
    the exception arguments.

    :param tmpdir: pytest's built-in temporary directory fixture.
    """
    expected_lines = [u"Some text\n", u"on multiple lines\n"]
    plain_file = tmpdir.join("plain.txt")
    plain_file.write("".join(expected_lines))
    assert expected_lines == get_plaintext_document_body(str(plain_file))

    # The fixture carries real markup so that it is HTML by content as well
    # as by extension.
    # NOTE(review): other code in this file identifies file types with
    # ``magic.from_file(..., mime=True)``, which inspects content; presumably
    # get_plaintext_document_body does the same -- confirm. A bare
    # "Some page" would then be seen as text/plain and nothing would raise.
    html_file = tmpdir.join("page.html")
    html_file.write("<html><body>Some page</body></html>")

    # Only the call under test sits inside the raises block, so the expected
    # exception can only come from get_plaintext_document_body itself.
    with pytest.raises(UnknownDocumentTypeError) as excinfo:
        get_plaintext_document_body(str(html_file))
    assert 'text/html' in excinfo.value.args
If you want to also link each reference to some other resource (like a record),
you can provide a linker_callback function to be executed for every reference
element found.
To override KBs for journal names etc., use ``override_kbs_files``:
>>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})
"""
if not os.path.isfile(path):
raise FullTextNotAvailableError(u"File not found: '{0}'".format(path))
docbody = get_plaintext_document_body(path)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)
if not reflines:
docbody = get_plaintext_document_body(path, keep_layout=True)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)
parsed_refs, stats = parse_references(
reflines,
recid=recid,
reference_format=reference_format,
linker_callback=linker_callback,
override_kbs_files=override_kbs_files,
)
if magic.from_file(path, mime=True) == "application/pdf":
texkeys = extract_texkeys_from_pdf(path)
if len(texkeys) == len(parsed_refs):
parsed_refs = [dict(ref, texkey=[key]) for ref, key in zip(parsed_refs, texkeys)]
return parsed_refs
>>> extract_references_from_file(path, reference_format=u"{title},{volume},{page}")
If you want to also link each reference to some other resource (like a record),
you can provide a linker_callback function to be executed for every reference
element found.
To override KBs for journal names etc., use ``override_kbs_files``:
>>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})
"""
if not os.path.isfile(path):
raise FullTextNotAvailableError(u"File not found: '{0}'".format(path))
docbody = get_plaintext_document_body(path)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)
if not reflines:
docbody = get_plaintext_document_body(path, keep_layout=True)
reflines, dummy, dummy = extract_references_from_fulltext(docbody)
parsed_refs, stats = parse_references(
reflines,
recid=recid,
reference_format=reference_format,
linker_callback=linker_callback,
override_kbs_files=override_kbs_files,
)
if magic.from_file(path, mime=True) == "application/pdf":
texkeys = extract_texkeys_from_pdf(path)
if len(texkeys) == len(parsed_refs):