def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
                      after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
    all_links = load_main_index(out_dir=OUTPUT_DIR)

    for link in all_links:
        if after is not None and float(link.timestamp) < after:
            continue
        if before is not None and float(link.timestamp) > before:
            continue
        if filter_patterns:
            if link_matches_filter(link, filter_patterns, filter_type):
                yield link
        else:
            yield link
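# A minimal usage sketch (hypothetical values; assumes the helpers used by
# list_archive_data above are in scope). Yields only links whose timestamp
# falls inside the given window and whose URL matches a pattern exactly:
for link in list_archive_data(filter_patterns=['https://example.com'],
                              filter_type='exact',
                              after=1544347898.0,
                              before=1713544860.0):
    print(link.timestamp, link.url)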
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
    # pass the out_dir argument through instead of ignoring it in favor of OUTPUT_DIR
    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    return {**duplicate, **orphaned, **corrupted, **unrecognized}
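# Note on the merge above: in {**duplicate, **orphaned, **corrupted, **unrecognized},
# later dicts win on key collisions, so a folder flagged by several scans is
# reported under the last category that contains it. A standalone illustration
# (toy values, not real Link objects):
first = {'archive/1544347898': None}
second = {'archive/1544347898': 'result-from-later-scan'}
merged = {**first, **second}
assert merged['archive/1544347898'] == 'result-from-later-scan'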
# status report over the collection's link data directories (excerpt from the
# `archivebox info` output; assumes `links` and the folder-status helpers
# above are in scope)
print()
num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
print(f'    > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
print(f'    > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
print()
print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
print(f'    > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
print(f'    > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
print(f'    > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
print(f'    > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
print(f'    > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
print(f'    > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')

if num_indexed:
    print()
    print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
    print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')

if orphaned:
    print()
    print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
    print('        archivebox init')
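# The status report above reuses each helper's docstring as the explanation
# column via __doc__. A minimal self-contained sketch of the same pattern
# (stub helper, not the real ArchiveBox function):
def get_indexed_folders_stub(links, out_dir='.'):
    """dirs with a valid index matched to the main index out_dir"""
    return {}

num = len(get_indexed_folders_stub([], out_dir='.'))
print(f'    > indexed: {num}'.ljust(36), f'({get_indexed_folders_stub.__doc__})')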
all_links = {
    link.url: link
    for link in load_main_index(out_dir=OUTPUT_DIR, warn=False)
}
print('    √ Loaded {} links from existing main index...'.format(len(all_links)))
# merge in any orphaned links from the old JSON and SQL indexes and from the
# individual archive data directories, never clobbering links that are
# already present in the main index
orphaned_json_links = {
    link.url: link
    for link in parse_json_main_index(OUTPUT_DIR)
    if link.url not in all_links
}
if orphaned_json_links:
    all_links.update(orphaned_json_links)
    print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

orphaned_sql_links = {
    link.url: link
    for link in parse_sql_main_index(OUTPUT_DIR)
    if link.url not in all_links
}
if orphaned_sql_links:
    all_links.update(orphaned_sql_links)
    print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))

orphaned_data_dir_links = {
    link.url: link
    for link in parse_json_links_details(OUTPUT_DIR)
}
orphan_new_links = {
    url: link
    for url, link in orphaned_data_dir_links.items()
    if url not in all_links
}
orphan_duplicates = {
    url: link
    for url, link in orphaned_data_dir_links.items()
    if url in all_links
}
if orphan_new_links:
    all_links.update(orphan_new_links)
    print('    {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
if orphan_duplicates:
    print('    {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI))

orphaned_data_dirs = set(orphan_duplicates.keys())
invalid_folders = {
    folder: link
    for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
    if folder not in orphaned_data_dirs
}
if invalid_folders:
    # report the count of the folders actually skipped here, not the
    # unrelated orphan_duplicates count
    print('    {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(invalid_folders), **ANSI))
if orphan_duplicates or invalid_folders:
    print('        For more information about the link data directories that were skipped, run:')
    print('            archivebox info')
    print('            archivebox list --status=invalid')
    print('            archivebox list --status=orphaned')
    print('            archivebox list --status=duplicate')

write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)

print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
if existing_index:
    print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
else:
    print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))

print()
print('    To view your archive index, open:')
print('        {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)))
print()
print('    To add new links, you can run:')
print("        archivebox add 'https://example.com'")
print()
print('    For more usage and examples, run:')
print('        archivebox help')
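# The dedup pattern used throughout the init code above, shown standalone:
# keep one Link per URL and let the already-indexed copy win by filtering
# incoming entries before update(). (Hypothetical toy values, not real Links.)
index = {'https://a.example': 'from-main-index'}
incoming = {'https://a.example': 'from-json-index', 'https://b.example': 'from-json-index'}
new_only = {url: link for url, link in incoming.items() if url not in index}
index.update(new_only)
assert index['https://a.example'] == 'from-main-index'   # existing entry preserved
assert index['https://b.example'] == 'from-json-index'   # only the new URL was added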