Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def should_fetch_git(link: Link, link_dir: Optional[str]=None) -> bool:
link_dir = link_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(link_dir, 'git')):
return False
is_clonable_url = (
(domain(link.url) in GIT_DOMAINS)
or (extension(link.url) == 'git')
)
if not is_clonable_url:
return False
return FETCH_GIT
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'git')):
return False
is_clonable_url = (
(domain(link.url) in GIT_DOMAINS)
or (extension(link.url) == 'git')
)
if not is_clonable_url:
return False
return SAVE_GIT
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
# if open(path, 'r').read().strip() != 'None':
return False
return SAVE_ARCHIVE_DOT_ORG
def should_fetch_title(link: Link, link_dir: Optional[str]=None) -> bool:
# if link already has valid title, skip it
if link.title and not link.title.lower().startswith('http'):
return False
if is_static_file(link.url):
return False
return FETCH_TITLE
def should_fetch_archive_dot_org(link: Link, link_dir: Optional[str]=None) -> bool:
link_dir = link_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(link_dir, 'archive.org.txt')):
# if open(path, 'r').read().strip() != 'None':
return False
return SUBMIT_ARCHIVE_DOT_ORG
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
if is_static_file(link.url):
return without_scheme(without_fragment(link.url))
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc
# > example.com/abc.html
# https://example.com/abc/
# > example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > example.com/abc?v=zzVa_tX1OiI.html
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'media')):
return False
return SAVE_MEDIA
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'output.pdf')):
return False
return SAVE_PDF
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
return False
return SAVE_SCREENSHOT
def should_fetch_pdf(link: Link, link_dir: Optional[str]=None) -> bool:
link_dir = link_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(link_dir, 'output.pdf')):
return False
return FETCH_PDF