How to use the spacy.load function in spacy

To help you get started, we've selected a few spacy examples, based on popular ways it is used in public projects.

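At its simplest, spacy.load takes the name of an installed model package and returns a Language pipeline that turns text into annotated Doc objects. A minimal sketch, assuming en_core_web_sm has been installed (e.g., via python -m spacy download en_core_web_sm):

import spacy

# Load an installed pipeline package by name; returns a Language object
nlp = spacy.load("en_core_web_sm")

# Calling the pipeline on text produces an annotated Doc
doc = nlp("spacy.load returns a ready-to-use NLP pipeline.")
print([(token.text, token.pos_) for token in doc])

Several of the examples below load shortcut names such as 'en'; those work in spaCy v1/v2, while spaCy v3 requires the full package name.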

github MultiPath / Squirrel / tools / reorder.py
import shutil
import pickle
import spacy

def reorder(filename, order='l2r', language='en'):

    shutil.copy(filename + '.src', filename + '.{}.src'.format(order))

    fi = open(filename + '.trg')
    fo = open(filename + '.{}.trg'.format(order), 'w')
    fp = open(filename + '.{}.pos'.format(order), 'w')

    if ('common' in order) or ('rare' in order):
        vocab_index, vocab_freq = pickle.load(open(filename + '.trg.voc.pkl', 'rb'))
        # vocab_freq = {w[0]: w[1] for w in vocab_freq}

    if order == 'dep':  # get the path following the default path of the dependency tree (ROOT-LEAF, LEFT-RIGHT)
        nlp = spacy.load(language)
        fd = open(filename + '.{}.full'.format(order), 'w')


    for i, line in enumerate(fi):

        if i % 1000 == 0:
            print('processed {} lines.'.format(i))

        words = line.strip().split()
        positions = list(range(1, len(words) + 1))
        eos_pos = len(words) + 1

        if order == 'l2r':
            words = ['
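The 'dep' branch above relies on the model's dependency parser. A rough sketch of what traversing a parse tree ROOT-to-leaf, left-to-right can look like (the dep_order helper and the en_core_web_sm model are illustrative, not part of the project):

import spacy

nlp = spacy.load("en_core_web_sm")  # any model with a parser works

def dep_order(sentence):
    """Return token indices in ROOT-to-leaf, left-to-right order."""
    doc = nlp(sentence)
    order = []
    queue = [sent.root for sent in doc.sents]
    while queue:
        token = queue.pop(0)           # breadth-first: roots first, then children
        order.append(token.i)
        queue.extend(token.children)   # children are yielded left-to-right
    return order

print(dep_order("The quick brown fox jumps over the lazy dog"))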
github anuragmishra1 / alter-nlu / processing / preprocessing.py
import re
import string
import spacy
from itertools import zip_longest

from nltk.stem.snowball import SnowballStemmer

# Load pretrained model
# nlp = spacy.load('en')
nlp = spacy.load('en', disable=['ner', 'parser'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Punctuation
tr = str.maketrans(string.punctuation, ' '*len(string.punctuation))
# Stemmer Language
stemmer = SnowballStemmer("english")

# remove punctuation
def remove_punct(val):
    return(re.sub(' +', ' ',val.translate(tr)).strip())


# snowball stemming
def create_stem(val):
    stemmed = ' '.join([stemmer.stem(word) for word in val.split()])
    return(stemmed)
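Note that spacy.load('en', disable=[...]) and nlp.create_pipe('sentencizer') are spaCy v2 idioms. In spaCy v3 the shortcut names were removed and add_pipe takes the component name directly; a rough v3 equivalent of the setup above:

import spacy

# spaCy v3: load a full package name and skip unneeded components
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.add_pipe("sentencizer")  # v3: pass the registered component name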
github eellak / gsoc2019-sphinx / email_clustering / helper.py
def get_spacy(emails):
    '''Represent emails as vectors using the spaCy Greek model.

    Args:
        emails: A list that contains the emails in string format.
    Returns:
        X: A list that contains the vectors of the emails.
    '''
    nlp = spacy.load('el_core_news_md')
    X = []
    for email in emails:
        doc = nlp(email)
        X.append(doc.vector)
    return X
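Here doc.vector is the average of the token vectors from the el_core_news_md model. For large email sets, nlp.pipe streams texts through the pipeline in batches, which is usually faster than calling nlp once per email; a sketch with made-up sample data:

import spacy

nlp = spacy.load("el_core_news_md")

emails = ["Καλημέρα, πώς είστε;", "Σας στέλνω το αρχείο που ζητήσατε."]

# nlp.pipe batches the texts instead of processing them one call at a time
X = [doc.vector for doc in nlp.pipe(emails)]
print(len(X), X[0].shape)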
github allenai / scibert / scripts / semeval2017_to_conll2003.py
        T11	Task 979 1051	accurate recording of the potential noise in the frequencies of interest
        T12	Process 97 114	corrosion process
        T13	Process 136 161	value of noise resistance
        T14	Task 175 199	validate this conclusion
        T15	Material 257 294	pair of nominally identical specimens
        T16	Task 13 33	theoretical analysis

Output:  Entity extraction (CONLL2003 format)

"""

from typing import *

import os
import spacy
nlp = spacy.load('en_core_web_md')

from sci_bert.common.span import Span, TokenSpan, MentionSpan, label_sent_token_spans

# each instance is a single NER[split][instance_id] = {'spans': List, 'labels': List}
NER = {'train': {}, 'dev': {}, 'test': {}}
for split in ['train', 'dev', 'test']:
    print(f'Processing {split}')
    ann_dir = f'semeval2017/{split}/'

    # loop over each instance ID in this split
    instance_ids = sorted({os.path.splitext(ann_file)[0] for ann_file in os.listdir(ann_dir)})
    for id in instance_ids:
        print(f'Processing {id}')
        ann_file = os.path.join(ann_dir, f'{id}.ann')
        txt_file = ann_file.replace('.ann', '.txt')
github NLPatVCU / medaCy / medacy / tools / json_to_pipeline.py
def __init__(self):
            super().__init__(
                "custom_pipeline",
                spacy_pipeline=spacy.load(input_json['spacy_pipeline'])
            )

            self.entities = input_json['entities']

            self.spacy_pipeline.tokenizer = self.get_tokenizer()

            self.add_component(GoldAnnotatorComponent, self.entities)

            if 'metamap' in input_json.keys():
                metamap = MetaMap(input_json['metamap'])
                self.add_component(MetaMapComponent, metamap)
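Here the argument to spacy.load comes from a JSON config. Judging by the keys the snippet reads, the config presumably looks something like the following (a hypothetical example, not medaCy's documented schema):

# Hypothetical input_json matching the keys used above
input_json = {
    "spacy_pipeline": "en_core_web_sm",  # passed straight to spacy.load
    "entities": ["Drug", "Dose", "Frequency"],
    # "metamap": "/path/to/metamap",     # optional: enables the MetaMap component
}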
github Papich23691 / S.S-Similarity / src / nlp_util.py
import spacy
import warnings

'''
To use the more accurate but slower model use "en_core_web_lg"
otherwise use "en_core_web_sm"
'''
nlp = spacy.load("en_core_web_sm")


def tokenize(sentence):
    '''
    Tokenize a sentence using the spaCy model
    and group the tokens in a list by part of speech
    '''
    doc = nlp(sentence)
    tags = []
    if len(doc) > 0:  # a Doc never equals []; check its length instead
        tags.append([doc[0].pos_])
        for w in doc:
            if w.pos_ == "PUNCT" or w.lemma_ == "be" or w.pos_ == "DET":
                continue
            istag = False
            for t in tags:
github fractalego / pynsett / pynsett / nl / __init__.py
import os
import spacy

from ..auxiliary import tag_is_verb, tag_is_noun, tag_is_adjective
from ..auxiliary import Collator

parser = spacy.load('en_core_web_lg')

_path = os.path.dirname(__file__)


def create_word_nodes(names, words, tags, types, lemmas, head_tokens, entities):
    return [{'name': name,
             'word': word,
             'tag': tag,
             'type': type,
             'lemma': lemma,
             'entity': entity,
             'head_token': head_token}
            for name, word, tag, type, lemma, entity, head_token
            in zip(names, words, tags, types, lemmas, entities, head_tokens)]
github oaqa / knn4qa / scripts / data_convert / text_proc.py
    :param  spacyModel   a name of the spacy model to use, e.g., en_core_web_sm
    :param  stopWords    a list of stop words to be excluded (case insensitive); 
                         a token is also excluded when its lemma is in the stop word list.
    :param  removePunct  a bool flag indicating if the punctuation tokens need to be removed
    :param  sentSplit    a bool flag indicating if sentence splitting is necessary
    :param  keepOnlyAlphaNum a bool flag indicating if we need to keep only alpha-numeric characters
    :param  enablePOS    a bool flag that enables POS tagging (which, e.g., can improve lemmatization)
    """

    # Disabling all heavy-weight parsing, but enabling splitting into sentences
    disableList = [SPACY_NER, SPACY_PARSER]
    if not enablePOS:
      disableList.append(SPACY_POS)
    print('Disabled Spacy components: ', disableList)

    self._nlp = spacy.load(spacyModel, disable=disableList)
    if sentSplit:
      sentencizer = self._nlp.create_pipe("sentencizer")
      self._nlp.add_pipe(sentencizer)

    self._removePunct = removePunct
    self._stopWords = frozenset([w.lower() for w in stopWords])
    self._keepOnlyAlphaNum = keepOnlyAlphaNum
    self._lowerCase = lowerCase
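SPACY_NER, SPACY_PARSER, and SPACY_POS are constants defined elsewhere in the project; they presumably hold spaCy's registered component names. A self-contained sketch of the same load-time trimming, with assumed constant values:

import spacy

# Assumed values for the project's constants
SPACY_NER, SPACY_PARSER, SPACY_POS = "ner", "parser", "tagger"

enablePOS = False
disableList = [SPACY_NER, SPACY_PARSER]
if not enablePOS:
    disableList.append(SPACY_POS)

# Disabling heavy components at load time makes the pipeline much faster
nlp = spacy.load("en_core_web_sm", disable=disableList)
print("Disabled spaCy components:", disableList)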
github fastai / fastai / old / fastai / text.py
def __init__(self, lang='en'):
        self.re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
        self.tok = spacy.load(lang)
        for w in ('<eos>', '<bos>', '<unk>'):  # register special tokens so the tokenizer keeps them whole
            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])
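add_special_case tells the tokenizer to keep a string together as a single token instead of splitting it on punctuation. A self-contained sketch (any English model will do):

import spacy
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_sm")

# Without the special case, '<eos>' would be split into '<', 'eos', '>'
nlp.tokenizer.add_special_case("<eos>", [{ORTH: "<eos>"}])

print([t.text for t in nlp("first sentence <eos> second sentence")])
# ['first', 'sentence', '<eos>', 'second', 'sentence']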
github DragonComputer / Dragonfire / dragonfire / commands / takenote_submodules / check_take_compare2.py
# -*- coding: utf-8 -*-

"""
.. module:: check_take_compare
    :platform: Unix
    :synopsis: the top-level submodule of Dragonfire.commands.takenote_submodules that contains the function for the second comparison in Dragonfire's simple if-else structure for the note-taking ability.

.. moduleauthor:: Cem Baybars GÜÇLÜ
"""
import datetime  # Basic date and time types
from random import choice  # Generate pseudo-random numbers
from dragonfire.nlplib import Classifier, Helper  # Submodule of Dragonfire to handle extra NLP tasks

import spacy  # Industrial-strength Natural Language Processing in Python

nlp = spacy.load('en')  # Load en_core_web_sm, English, 50 MB, default model


def is_todo(com, note_taker, user_answering_note, userin, user_prefix):
    """Method to dragonfire's second command struct for checking to do list of taking note ability.

    Args:
        com (str):                 User's command.
        note_taker (object):        note_taker class's object.
        user_answering_note:       User answering string array.
        userin:                    :class:`dragonfire.utilities.TextToAction` instance.
        user_prefix:               user's preferred titles.
    """

    if user_answering_note['isTodo']:
        if not user_answering_note['toDo_listname']:
            user_answering_note['toDo_listname'] = com