import pickle
import shutil
import spacy

def reorder(filename, order='l2r', language='en'):
    shutil.copy(filename + '.src', filename + '.{}.src'.format(order))
    fi = open(filename + '.trg')
    fo = open(filename + '.{}.trg'.format(order), 'w')
    fp = open(filename + '.{}.pos'.format(order), 'w')
    if ('common' in order) or ('rare' in order):
        vocab_index, vocab_freq = pickle.load(open(filename + '.trg.voc.pkl', 'rb'))
        # vocab_freq = {w[0]: w[1] for w in vocab_freq}
    if order == 'dep':  # follow the default traversal of the dependency tree (ROOT-LEAF, LEFT-RIGHT)
        nlp = spacy.load(language)
        fd = open(filename + '.{}.full'.format(order), 'w')
    for i, line in enumerate(fi):
        if i % 1000 == 0:
            print('processed {} lines.'.format(i))
        words = line.strip().split()
        positions = list(range(1, len(words) + 1))
        eos_pos = len(words) + 1
        if order == 'l2r':
            pass  # left-to-right keeps the original order; the source snippet is truncated here
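A hedged usage sketch for reorder above; the file prefix is an illustrative assumption (the function expects <filename>.src and <filename>.trg to exist on disk).

reorder('data/train', order='l2r', language='en')  # writes data/train.l2r.src / .trg / .pos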
import re
import string
import spacy
from itertools import zip_longest
from nltk.stem.snowball import SnowballStemmer

# Load pretrained model
# nlp = spacy.load('en')
nlp = spacy.load('en', disable=['ner', 'parser'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Punctuation: map every punctuation character to a space
tr = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Stemmer language
stemmer = SnowballStemmer("english")

# remove punctuation and collapse repeated spaces
def remove_punct(val):
    return re.sub(' +', ' ', val.translate(tr)).strip()

# snowball stemming
def create_stem(val):
    stemmed = ' '.join([stemmer.stem(word) for word in val.split()])
    return stemmed
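A short usage sketch for the two helpers above; the sample strings are illustrative assumptions.

print(remove_punct('Hello, world! How are you?'))  # -> 'Hello world How are you'
print(create_stem('running runners ran'))          # -> 'run runner ran'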
import spacy

def get_spacy(emails):
    '''Represent emails as vectors using the spaCy Greek model.
    Args:
        emails: A list that contains the emails in string format.
    Returns:
        X: A list that contains the vectors of the emails.
    '''
    nlp = spacy.load('el_core_news_md')
    X = []
    for email in emails:
        doc = nlp(email)
        X.append(doc.vector)
    return X
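A hedged usage sketch; the sample emails are illustrative, and el_core_news_md must be installed first (python -m spacy download el_core_news_md).

vectors = get_spacy(['Καλημέρα, πώς είστε;', 'Σας ευχαριστώ πολύ.'])
print(len(vectors), vectors[0].shape)  # one vector per email; the dimension depends on the model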
T11 Task 979 1051 accurate recording of the potential noise in the frequencies of interest
T12 Process 97 114 corrosion process
T13 Process 136 161 value of noise resistance
T14 Task 175 199 validate this conclusion
T15 Material 257 294 pair of nominally identical specimens
T16 Task 13 33 theoretical analysis
Output: Entity extraction (CONLL2003 format)
"""
from typing import *
import os
import spacy
nlp = spacy.load('en_core_web_md')
from sci_bert.common.span import Span, TokenSpan, MentionSpan, label_sent_token_spans

# each instance is a single NER[split][instance_id] = {'spans': List, 'labels': List}
NER = {'train': {}, 'dev': {}, 'test': {}}
for split in ['train', 'dev', 'test']:
    print(f'Processing {split}')
    ann_dir = f'semeval2017/{split}/'
    # loop over each instance ID in this split
    instance_ids = sorted({os.path.splitext(ann_file)[0] for ann_file in os.listdir(ann_dir)})
    for id in instance_ids:
        print(f'Processing {id}')
        ann_file = os.path.join(ann_dir, f'{id}.ann')
        txt_file = ann_file.replace('.ann', '.txt')
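A hedged helper showing how the BRAT standoff lines quoted above (ID, then label with character offsets, then surface text, tab-separated) could be parsed; the function name is an assumption, not part of the source.

def parse_ann_line(line):
    # e.g. 'T11\tTask 979 1051\taccurate recording of the potential noise ...'
    ann_id, middle, surface = line.rstrip('\n').split('\t')
    label, start, end = middle.split()
    return ann_id, label, int(start), int(end), surface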
def __init__(self):
    super().__init__(
        "custom_pipeline",
        spacy_pipeline=spacy.load(input_json['spacy_pipeline'])
    )
    self.entities = input_json['entities']
    self.spacy_pipeline.tokenizer = self.get_tokenizer()
    self.add_component(GoldAnnotatorComponent, self.entities)
    if 'metamap' in input_json.keys():
        metamap = MetaMap(input_json['metamap'])
        self.add_component(MetaMapComponent, metamap)
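A hedged sketch of the input_json shape the constructor above expects, inferred from the keys it reads; the concrete values are illustrative assumptions.

input_json = {
    'spacy_pipeline': 'en_core_web_sm',   # model name passed to spacy.load
    'entities': ['Drug', 'Dose'],         # entity labels for the gold annotator
    'metamap': '/path/to/metamap',        # optional; enables the MetaMap component
}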
import spacy
import warnings

'''
To use the more accurate but slower model, use "en_core_web_lg";
otherwise use "en_core_web_sm".
'''
nlp = spacy.load("en_core_web_sm")

def tokenize(sentence):
    '''
    Tokenize a sentence using the spaCy model
    and group the tokens in a list by part of speech.
    '''
    doc = nlp(sentence)
    tags = []
    if len(doc) > 0:
        tags.append([doc[0].pos_])
    for w in doc:
        if w.pos_ == "PUNCT" or w.lemma_ == "be" or w.pos_ == "DET":
            continue
        istag = False
        for t in tags:
            pass  # grouping logic continues here; the source snippet is truncated
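A short illustration of the token attributes tokenize relies on; the sample sentence is an assumption.

for tok in nlp('The cat is happy'):
    print(tok.text, tok.pos_, tok.lemma_)  # surface form, coarse POS tag, lemma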
import os
import spacy

from ..auxiliary import tag_is_verb, tag_is_noun, tag_is_adjective
from ..auxiliary import Collator

parser = spacy.load('en_core_web_lg')
_path = os.path.dirname(__file__)

def create_word_nodes(names, words, tags, types, lemmas, head_tokens, entities):
    return [{'name': name,
             'word': word,
             'tag': tag,
             'type': type,
             'lemma': lemma,
             'entity': entity,
             'head_token': head_token}
            for name, word, tag, type, lemma, entity, head_token
            in zip(names, words, tags, types, lemmas, entities, head_tokens)]
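A minimal usage sketch for create_word_nodes; the sample values are illustrative assumptions.

nodes = create_word_nodes(names=['n0'], words=['cats'], tags=['NNS'],
                          types=['NOUN'], lemmas=['cat'],
                          head_tokens=['cats'], entities=[''])
print(nodes[0]['lemma'])  # -> 'cat'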
:param spacyModel: a name of the spaCy model to use, e.g., en_core_web_sm
:param stopWords: a list of stop words to be excluded (case insensitive);
                  a token is also excluded when its lemma is in the stop word list.
:param removePunct: a bool flag indicating if the punctuation tokens need to be removed
:param sentSplit: a bool flag indicating if sentence splitting is necessary
:param keepOnlyAlphaNum: a bool flag indicating if we need to keep only alpha-numeric characters
:param lowerCase: a bool flag indicating if the text needs to be lower-cased
:param enablePOS: a bool flag that enables POS tagging (which, e.g., can improve lemmatization)
"""
# Disabling all heavy-weight parsing, but enabling splitting into sentences
disableList = [SPACY_NER, SPACY_PARSER]
if not enablePOS:
    disableList.append(SPACY_POS)
print('Disabled spaCy components: ', disableList)
self._nlp = spacy.load(spacyModel, disable=disableList)
if sentSplit:
    sentencizer = self._nlp.create_pipe("sentencizer")
    self._nlp.add_pipe(sentencizer)
self._removePunct = removePunct
self._stopWords = frozenset([w.lower() for w in stopWords])
self._keepOnlyAlphaNum = keepOnlyAlphaNum
self._lowerCase = lowerCase
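A hedged construction sketch for the parser wrapper above; the class name SpacyTextParser and the SPACY_* constant values are assumptions, since the snippet shows neither.

SPACY_NER, SPACY_PARSER, SPACY_POS = 'ner', 'parser', 'tagger'  # assumed pipe names

proc = SpacyTextParser('en_core_web_sm', stopWords=['the', 'an'],
                       removePunct=True, sentSplit=True,
                       keepOnlyAlphaNum=False, lowerCase=True,
                       enablePOS=False)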
import re
import spacy
from spacy.symbols import ORTH

def __init__(self, lang='en'):
    self.re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
    self.tok = spacy.load(lang)
    # special-case tokens (assumed to be '<eos>', '<bos>' and '<unk>'; the
    # angle-bracketed strings are stripped in the source)
    for w in ('<eos>', '<bos>', '<unk>'):
        self.tok.tokenizer.add_special_case(w, [{ORTH: w}])
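A short illustration of what add_special_case above achieves; the sample text is an assumption.

tok = spacy.load('en')
tok.tokenizer.add_special_case('<eos>', [{ORTH: '<eos>'}])
print([t.text for t in tok('hello <eos>')])  # '<eos>' is kept as a single token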
# -*- coding: utf-8 -*-
"""
.. module:: check_take_compare
    :platform: Unix
    :synopsis: the top-level submodule of Dragonfire.commands.takenote_submodules that contains the function related to the second if-else comparison of Dragonfire's note-taking ability.

.. moduleauthor:: Cem Baybars GÜLEÇ
"""

import datetime  # Basic date and time types
from random import choice  # Generate pseudo-random numbers

from dragonfire.nlplib import Classifier, Helper  # Submodule of Dragonfire to handle extra NLP tasks

import spacy  # Industrial-strength Natural Language Processing in Python

nlp = spacy.load('en')  # Load en_core_web_sm, English, 50 MB, default model


def is_todo(com, note_taker, user_answering_note, userin, user_prefix):
    """Method for Dragonfire's second command structure, which checks the to-do list of the note-taking ability.

    Args:
        com (str): User's command.
        note_taker (object): note_taker class's object.
        user_answering_note: User answering string array.
        userin: :class:`dragonfire.utilities.TextToAction` instance.
        user_prefix: User's preferred title.
    """
    if user_answering_note['isTodo']:
        if not user_answering_note['toDo_listname']:
            user_answering_note['toDo_listname'] = com