How to use the depccg.printer.logger.info function in depccg

To help you get started, we’ve selected a few depccg examples, based on popular ways it is used in public projects.

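All of the examples below report progress through a module-level logger, so the logger.info calls only produce visible output if the calling application configures Python's standard logging. A minimal, depccg-agnostic sketch (everything here is standard library; nothing is specific to depccg):

import logging

# Standard-library setup only: route INFO-level records (including the
# logger.info calls in the depccg snippets below) to the console.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s: %(message)s',
    level=logging.INFO,
)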

github masashi-y / depccg / depccg / tokens.py
def annotate_using_janome(sentences, tokenize=False):
    assert tokenize, 'no support for using janome with pre-tokenized inputs'
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error('failed to import janome. please install it by "pip install janome".')
        exit(1)

    logger.info('use Janome to tokenize and annotate POS infos.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        tokenized = tokenizer.tokenize(sentence)
        tokens = []
        for token in tokenized:
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
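A usage sketch for the snippet above. The import path follows the file shown (depccg/tokens.py) and the return value is not visible in the excerpt, so the call below relies only on what the excerpt guarantees: janome must be installed, and tokenize=True must be passed or the assert fails.

# Sketch only: assumes annotate_using_janome is importable from depccg.tokens
# (the file shown above) and that janome is installed (pip install janome).
from depccg.tokens import annotate_using_janome

sentences = [['メロスは激怒した。']]   # each sentence is a list of string pieces
annotate_using_janome(sentences, tokenize=True)
# expected log line: "use Janome to tokenize and annotate POS infos."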
github masashi-y / depccg / depccg / tokens.py
def annotate_using_spacy(sentences, tokenize=False, n_threads=2, batch_size=10000):
    try:
        import spacy
        from spacy.tokens import Doc
    except ImportError:
        logger.error('failed to import spacy. please install it by "pip install spacy".')
        exit(1)

    nlp = spacy.load('en', disable=['parser'])
    logger.info('use spacy to annotate POS and NER infos.')

    if tokenize:
        docs = [nlp.tokenizer(' '.join(sentence)) for sentence in sentences]
        raw_sentences = [[str(token) for token in doc] for doc in docs]
    else:
        docs = [Doc(nlp.vocab, sentence) for sentence in sentences]
    for name, proc in nlp.pipeline:
        docs = proc.pipe(docs,
                         n_threads=n_threads,
                         batch_size=batch_size)

    res = []
    for sentence in docs:
        tokens = []
        for token in sentence:
            if token.ent_iob_ == 'O':
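A corresponding sketch for the spaCy-based annotator. Note that spacy.load('en', ...) relies on the old 'en' model shortcut from spaCy 1.x/2.x; newer spaCy releases require a concrete model name such as en_core_web_sm, so treat the setup below as version-dependent.

# Sketch only: assumes annotate_using_spacy is importable from depccg.tokens
# and that an English spaCy model resolvable as 'en' is installed
# (e.g. python -m spacy download en on spaCy 2.x).
from depccg.tokens import annotate_using_spacy

sentences = [['The', 'dog', 'barks', '.']]
annotate_using_spacy(sentences, tokenize=True)
# expected log line: "use spacy to annotate POS and NER infos."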
github masashi-y / depccg / depccg / tokens.py
def annotate_using_jigg(sentences, tokenize=False, pipeline='ssplit,kuromoji'):
    assert tokenize, 'no support for using jigg with pre-tokenized inputs'
    logger.info('use Jigg to tokenize and annotate POS infos.')

    jigg_dir = os.environ.get('JIGG', None)
    if not jigg_dir:
        logger.error('did not find Jigg at JIGG environmental variable. exiting..')
        exit(1)

    tmpfile = tempfile.mktemp()
    with open(tmpfile, 'w') as f:
        for sentence in sentences:
            print(' '.join(sentence), file=f)

    outfile = tempfile.mktemp()
    command = jigg_cmd.format(jigg_dir,
                              pipeline,
                              tmpfile,
                              outfile)
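A sketch for the Jigg-based annotator. Jigg is a separate Java tool located through the JIGG environment variable, and the snippet above exits immediately if that variable is unset; the path below is hypothetical, and the call will only succeed if Jigg is actually installed there.

# Sketch only: assumes annotate_using_jigg is importable from depccg.tokens
# and that Jigg is installed at the (hypothetical) path below.
import os
from depccg.tokens import annotate_using_jigg

os.environ['JIGG'] = '/path/to/jigg'
sentences = [['メロスは激怒した。']]
annotate_using_jigg(sentences, tokenize=True)   # tokenize=True is required
# expected log line: "use Jigg to tokenize and annotate POS infos."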
github masashi-y / depccg / depccg / tokens.py
    candc_dir = os.environ.get('CANDC', None)
    candc_model_pos = None
    candc_model_ner = None
    fail = False
    if candc_dir:
        candc_dir = Path(candc_dir)
        candc_model_pos = Path(os.environ.get('CANDC_MODEL_POS', str(candc_dir / 'models' / 'pos')))
        candc_model_ner = Path(os.environ.get('CANDC_MODEL_NER', str(candc_dir / 'models' / 'ner')))
        if (candc_dir / 'bin' / 'pos').exists() and \
                (candc_dir / 'bin' / 'ner').exists() and \
                candc_model_pos.exists() and \
                candc_model_ner.exists():
            pass
        else:
            logger.info('CANDC environmental variable may not be configured correctly.')
            logger.info('$CANDC/bin/{pos,ner} and $CANDC/models/{pos,ner} are expected to exist.')
            fail = True
    else:
        fail = True

    if fail:
        logger.info('did not find C&C parser at CANDC environmental variable.')
        logger.info('fill POS tag etc. using XX tag.')
        return annotate_XX(sentences)

    logger.info('find C&C parser at CANDC environmental variable.')
    logger.info('use C&C pipeline to annotate POS and NER infos.')
    logger.info(f'C&C models: [{candc_model_pos}, {candc_model_ner}]')

    stemmer = MorphaStemmer(str(MODEL_DIRECTORY / 'verbstem.list'))

    tmpfile = tempfile.mktemp()
    with open(tmpfile, 'w') as f:
        for sentence in sentences:
            print(' '.join(sentence), file=f)

    command = candc_cmd.format(tmpfile,
                               candc_dir,
                               candc_model_pos,
                               candc_model_ner)
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
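The excerpt above (the body of the C&C-based annotator in depccg/tokens.py) decides between the C&C pipeline and a dummy XX fallback purely from environment variables, and reports that decision through logger.info. A sketch of the environment it checks; the paths are hypothetical, and only the variable names come from the excerpt:

import os

# Hypothetical install location; the annotator expects $CANDC/bin/{pos,ner}
# and $CANDC/models/{pos,ner} to exist.
os.environ['CANDC'] = '/opt/candc'
# Optional overrides for the model paths (defaults shown in the excerpt).
os.environ['CANDC_MODEL_POS'] = '/opt/candc/models/pos'
os.environ['CANDC_MODEL_NER'] = '/opt/candc/models/ner'
# If the check fails, the annotator logs the problem via logger.info and
# falls back to annotate_XX(sentences), which fills POS tags with XX.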