How to use archivebox - 10 common examples

To help you get started, we’ve selected a few archivebox examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github pirate / ArchiveBox / archivebox / tests.py View on Github external
def test_conflicting_init(self):
        """Check that init exits and leaves no index behind after a stray file is added.

        Steps: write ``test_conflict.txt`` into OUTPUT_DIR, run
        ``archivebox_init.main([])`` expecting ``SystemExit``, verify that none
        of the SQL/JSON/HTML index files exist, then verify that
        ``load_main_index`` raises instead of returning.
        """
        with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
            f.write('test')

        try:
            with output_hidden(show_failing=False):
                archivebox_init.main([])
        except SystemExit:
            pass
        else:
            assert False, 'Init should have exited with an exception'

        assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
        assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
        assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))

        # The failure assertion must live in `else`: placed inside the `try`
        # body, its AssertionError would be caught by the handler below and the
        # check could never fail. SystemExit is listed explicitly because it is
        # not a subclass of Exception.
        try:
            load_main_index(out_dir=OUTPUT_DIR)
        except (Exception, SystemExit):
            pass
        else:
            assert False, 'load_main_index should raise an exception when no index is present'
github pirate / ArchiveBox / archivebox / tests.py View on Github external
def test_remove_regex(self):
        """Remove links with a regex filter and expect 4 links left in the main index."""
        # Raw string: the pattern contains `\/`, `\.` and `\d`, which are invalid
        # escape sequences in a normal string literal and trigger a warning on
        # newer Python versions. The pattern text itself is unchanged.
        url_pattern = r'http(s)?:\/\/(.+\.)?(example\d\.com)'

        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', url_pattern])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 4
github pirate / ArchiveBox / archivebox / tests.py View on Github external
def test_add_stdin_url(self):
        """Run `archivebox add` with test_urls supplied as stdin; expect 12 indexed links."""
        cli_args = []
        with output_hidden():
            archivebox_add.main(cli_args, stdin=test_urls)

        indexed_links = load_main_index(out_dir=OUTPUT_DIR)
        link_count = len(indexed_links)
        assert link_count == 12
github pirate / ArchiveBox / archivebox / tests.py View on Github external
def test_remove_exact(self):
        """Remove a single URL passed verbatim; expect 11 links left in the main index."""
        remove_args = ['--yes', '--delete', 'https://example5.com/']
        with output_hidden():
            archivebox_remove.main(remove_args)

        remaining_links = load_main_index(out_dir=OUTPUT_DIR)
        remaining_count = len(remaining_links)
        assert remaining_count == 11
github pirate / ArchiveBox / archivebox / archive_methods.py View on Github external
@enforce_types
def fetch_git(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using git"""

    # An explicitly passed link_dir wins; otherwise use the one stored on the link.
    link_dir = link_dir or link.link_dir
    # Directory named after the output ('git') inside link_dir, created up front
    # (exist_ok=True, so an existing directory is not an error).
    output: ArchiveOutput = 'git'
    output_path = os.path.join(link_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        GIT_BINARY,
        'clone',
        '--mirror',
        '--recursive',
        # Add `-c http.sslVerify=false` only when CHECK_SSL_VALIDITY is falsy.
        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
        # Presumably the URL with its fragment and query string stripped, going
        # by the helper names — verify against their definitions.
        without_query(without_fragment(link.url)),
    ]
    # NOTE(review): this excerpt ends here; running `cmd`, using `timeout` and
    # building the ArchiveResult are not visible in this view.
    status = 'succeeded'
github pirate / ArchiveBox / archivebox / parsers / __init__.py View on Github external
@enforce_types
def save_file_to_sources(path: str, timeout: int=TIMEOUT, out_dir: str=OUTPUT_DIR) -> str:
    """download a given url's content into output/sources/domain-.txt"""
    # check_data_folder is defined elsewhere; presumably it aborts when out_dir
    # is not a valid data folder — TODO confirm.
    check_data_folder(out_dir=out_dir)

    # Make sure the sources directory exists under out_dir.
    sources_dir = os.path.join(out_dir, SOURCES_DIR_NAME)
    if not os.path.exists(sources_dir):
        os.makedirs(sources_dir)

    # Whole-second timestamp: everything after the first '.' is dropped.
    ts = str(datetime.now().timestamp()).split('.', 1)[0]

    # Default file name is "<basename(path)>-<ts>.txt".
    source_path = os.path.join(sources_dir, '{}-{}.txt'.format(basename(path), ts))

    # For http/https/ftp paths the name is built from domain(path) instead.
    # NOTE(review): the excerpt ends inside the print(...) call below; the
    # download itself and the return value are not visible in this view.
    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source_path = os.path.join(sources_dir, '{}-{}.txt'.format(domain(path), ts))
        print('{}[*] [{}] Downloading {}{}'.format(
            ANSI['green'],
github pirate / ArchiveBox / archivebox / main.py View on Github external
# Excerpt of a larger if/elif chain: this branch handles `set`, parsing
# KEY=VALUE lines from config_options and handing the accepted ones to
# write_config_file.
elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            # Lines starting with '#' and blank lines are skipped.
            if line.startswith('#') or not line.strip():
                continue
            # A line without '=' is a hard error: report it and exit with status 2.
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f'    {line}')
                raise SystemExit(2)

            # NOTE(review): split('=') without a maxsplit yields more than two
            # parts when the value itself contains '=', which makes this
            # unpacking raise ValueError; split('=', 1) would avoid that.
            raw_key, val = line.split('=')
            raw_key = raw_key.upper().strip()
            # get_real_name may return a different name; when it does, the user
            # is told the option was renamed.
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            # Only keys present in CONFIG are accepted; other lines are
            # collected in failed_options.
            if key in CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            # NOTE(review): `before` is another reference to CONFIG, not a copy;
            # the before/after comparison below relies on load_all_config()
            # returning a new mapping rather than mutating CONFIG — confirm.
            before = CONFIG
            matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
            after = load_all_config()
            print(printable_config(matching_config))

            # Collect USER_CONFIG keys whose value changed between `before` and
            # `after` but which are not in matching_config.
            side_effect_changes: ConfigDict = {}
            for key, val in after.items():
                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
                    side_effect_changes[key] = after[key]
github pirate / ArchiveBox / archivebox / main.py View on Github external
# Excerpt starts mid-branch: last usage example printed before exiting with status 1.
stderr('        archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        # Pass the job list through dedupe_cron_jobs, then write it out.
        cron = dedupe_cron_jobs(cron)
        cron.write()

        # total_runs sums frequency_per_year() over every job in `cron`;
        # existing_jobs holds only the jobs found via CRON_COMMENT.
        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        # The job whose string form equals new_job is marked with '>'; the rest are indented.
        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
        # Storage warning, shown only above 60 estimated runs per year and when not quiet.
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr('    Make sure you have enough storage space available to hold all the data.')
            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
        # Reaching this point exits with status 0.
        raise SystemExit(0)
github pirate / ArchiveBox / archivebox / main.py View on Github external
# Excerpt starts mid-branch: usage examples printed before exiting with status 1.
stderr('        archivebox init --every=day https://example.com/some/rss/feed.xml')
            stderr('        archivebox init --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        # Pass the job list through dedupe_cron_jobs, then write it out.
        cron = dedupe_cron_jobs(cron)
        cron.write()

        # total_runs sums frequency_per_year() over every job in `cron`;
        # existing_jobs holds only the jobs found via CRON_COMMENT.
        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        # The job whose string form equals new_job is marked with '>'; the rest are indented.
        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
        # Storage warning, shown only above 60 estimated runs per year and when not quiet.
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr(f'    Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr('    Make sure you have enough storage space available to hold all the data.')
            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
        # Reaching this point exits with status 0.
        raise SystemExit(0)
github pirate / ArchiveBox / archivebox / main.py View on Github external
# Excerpt starts mid-branch: prints side_effect_changes, formatted by printable_config.
print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
        # List every option line that could not be set.
        if failed_options:
            stderr()
            stderr('[X] These options failed to set:', color='red')
            stderr('    {}'.format('\n    '.join(failed_options)))
        # bool is an int subclass: exit status is 1 when any option failed, else 0.
        raise SystemExit(bool(failed_options))
    elif reset:
        # Reset is not implemented: point at CONFIG_FILE and exit with status 2.
        stderr('[X] This command is not implemented yet.', color='red')
        stderr('    Please manually remove the relevant lines from your config file:')
        stderr(f'        {CONFIG_FILE}')
        raise SystemExit(2)
    else:
        # No earlier branch matched: print usage and exit with status 2.
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr('    archivebox config')
        stderr('    archivebox config --get SOME_KEY')
        stderr('    archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)