Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def __init__(self, **kwargs):
    """Initialise the parent class, then pin langdetect's seed to 0.

    Args:
        **kwargs: forwarded unchanged to the parent ``__init__``.
    """
    super().__init__(**kwargs)

    # Imported here rather than at module level, as in the original code.
    import langdetect

    # A fixed seed keeps language detection results consistent between runs.
    langdetect.DetectorFactory.seed = 0
def main(argv):
    """Read the config file named by ``argv[0]`` and build the ad creative retriever.

    Args:
        argv: argument list; only ``argv[0]`` (the config file path) is read.

    NOTE(review): the visible portion of this function ends right after the
    retriever is constructed -- confirm against the full file.
    """
    parsed_config = configparser.ConfigParser()
    parsed_config.read(argv[0])

    # Force consistent langdetect results. https://pypi.org/project/langdetect/
    DetectorFactory.seed = 0

    fb_access_token = config_utils.get_facebook_access_token(parsed_config)
    batch_size = parsed_config.getint(
        'LIMITS', 'BATCH_SIZE', fallback=DEFAULT_BATCH_SIZE)
    logging.info('Will commit to DB every %d snapshots processed.', batch_size)
    slack_url = parsed_config.get('LOGGING', 'SLACK_URL')
    connection_params = config_utils.get_database_connection_params_from_config(
        parsed_config)

    with config_utils.get_database_connection(connection_params) as db_connection:
        # One bucket client per bucket, both created from the same credentials file.
        images_bucket_client = make_gcs_bucket_client(
            AD_CREATIVE_IMAGES_BUCKET, GCS_CREDENTIALS_FILE)
        screenshots_bucket_client = make_gcs_bucket_client(
            ARCHIVE_SCREENSHOTS_BUCKET, GCS_CREDENTIALS_FILE)
        image_retriever = FacebookAdCreativeRetriever(
            db_connection,
            images_bucket_client,
            screenshots_bucket_client,
            fb_access_token,
            batch_size,
            slack_url)
def detect_tweet_language(self, *args):
    """Return a deep copy of this object filtered by detected tweet language.

    Args:
        *args: values compared against the result of ``detect`` on each
            tweet's ``'text'`` field (presumably language codes -- confirm
            against callers).

    Returns:
        A deep copy of ``self`` on which ``apply_filter_to_collections`` has
        been called with the language predicate; ``self`` is not modified by
        this method.
    """
    # A fixed seed keeps langdetect's results consistent between runs.
    DetectorFactory.seed = 0

    def language_in_tweet(tweet):
        """Tell whether the tweet's detected language is one of ``args``."""
        try:
            detected_lang = detect(tweet['text'])
        except lang_detect_exception.LangDetectException:
            # Detection failed: fall back to None, which only matches if the
            # caller explicitly passed None among args.
            detected_lang = None
        # Plain membership test; wrapping a single boolean in any([...])
        # added nothing.
        return detected_lang in args

    cp = copy.deepcopy(self)
    cp.apply_filter_to_collections(language_in_tweet)
    return cp
# Compare the version tuple rather than the version string: string comparison
# is lexicographic, so a major version such as '10.x' would sort before '3.0'.
if sys.version_info >= (3, 0):
    from urllib.parse import urlparse
else:
    from urlparse import urlparse
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Pattern for a unified-diff hunk header line; see
# https://github.com/matiasb/python-unidiff/blob/master/unidiff/constants.py#L37
# @@ (source offset, length) (target offset, length) @@ (section header)
# Groups: 1 = source offset, 2 = source length (optional), 3 = target offset,
# 4 = target length (optional), 5 = trailing section header text.
# re.MULTILINE lets ^ and $ match at every line of a multi-line string.
RE_HUNK_HEADER = re.compile(
r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))?\ @@[ ]?(.*)$",
flags=re.MULTILINE)
# Ensure deterministic language detection.
DetectorFactory.seed = 0
# NOTE(review): presumably the minimum text length required before language
# detection is attempted -- confirm at the use site.
MIN_TEXT_LENGTH_FOR_DETECTION = 20
def block_num_from_hash(block_hash):
    """Extract the block number from a block ID.

    The block number is encoded in the first 4 bytes (8 hex digits) of the ID.

    Args:
        block_hash (str): block ID; it is passed through ``str()`` before
            being sliced.

    Returns:
        int: the leading (up to) 8 characters parsed as a base-16 integer.
    """
    hex_prefix = str(block_hash)[:8]
    return int(hex_prefix, 16)
def block_num_from_previous(previous_block_hash):
def tag_reviews_language(self):
    """Tag every record in ``self.records`` with its detected language.

    If ``Constants.LANGUAGE_RECORDS_FILE`` already exists, the records are
    loaded from that file instead and no detection is run. Otherwise each
    record gets ``Constants.LANGUAGE_FIELD`` set to the language detected
    from its ``Constants.TEXT_FIELD`` (``'unknown'`` when detection fails),
    and the tagged records are saved to ``Constants.LANGUAGE_RECORDS_FILE``.
    """
    print('%s: tag reviews language' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    already_tagged = os.path.exists(Constants.LANGUAGE_RECORDS_FILE)
    if already_tagged:
        print('Records have already been tagged with language field')
        self.records = ETLUtils.load_json_file(
            Constants.LANGUAGE_RECORDS_FILE)
        return

    # A fixed seed keeps langdetect's results consistent between runs.
    DetectorFactory.seed = 0

    for entry in self.records:
        try:
            detected = langdetect.detect(entry[Constants.TEXT_FIELD])
        except LangDetectException:
            detected = 'unknown'
        entry[Constants.LANGUAGE_FIELD] = detected

    ETLUtils.save_json_file(Constants.LANGUAGE_RECORDS_FILE, self.records)
def detect_language(text):
    """Return what ``langdetect.detect`` reports for ``text``.

    The detector seed is set to 0 on every call so results stay consistent
    between builds.
    """
    import langdetect

    # Stay consistent between builds
    langdetect.DetectorFactory.seed = 0
    result = langdetect.detect(text)
    return result
def detect_tweet_language(self, *args):
    """Return a deep copy of this object with a language filter installed.

    Args:
        *args: values compared against the result of ``detect`` on each
            tweet's ``'text'`` field (presumably language codes -- confirm
            against callers).

    Returns:
        A deep copy of ``self`` whose ``collection`` has had
        ``set_custom_filter`` called with the language predicate; ``self`` is
        not modified by this method.
    """
    # A fixed seed keeps langdetect's results consistent between runs.
    DetectorFactory.seed = 0

    def language_in_tweet(tweet):
        """Tell whether the tweet's detected language is one of ``args``."""
        try:
            detected_lang = detect(tweet['text'])
        except lang_detect_exception.LangDetectException:
            # Detection failed: fall back to None, which only matches if the
            # caller explicitly passed None among args.
            detected_lang = None
        # Plain membership test; wrapping a single boolean in any([...])
        # added nothing.
        return detected_lang in args

    cp = copy.deepcopy(self)
    cp.collection.set_custom_filter(language_in_tweet)
    return cp
* Python wrapper: https://pypi.python.org/pypi/langdetect
* Based on: https://github.com/shuyo/language-detection
"""
# Core Library modules
from typing import Any, Dict, List
# Third party modules
import click
import pkg_resources
from langdetect import DetectorFactory, detect, detect_langs
# First party modules
import lidtk.classifiers
DetectorFactory.seed = 0 # Make sure we get consistent results
class LangdetectClassifier(lidtk.classifiers.LIDClassifier):
"""LID with the Langdetect classifier."""
def predict(self, text: str) -> str:
    """
    Detect the language of a text and map the result with ``map2wili``.

    Parameters
    ----------
    text : str
        Text handed to langdetect's ``detect``.

    Returns
    -------
    str
        The value ``self.map2wili`` returns for the detected language.
    """
    detected = detect(text)
    return self.map2wili(detected)
def predict_proba(self, text: str) -> List[Dict[str, Any]]:
# Review polarity tag, i.e. either "recommended" or "not recommended"
is_a_positive_review = review['voted_up']
# Review text
review_content = review['review']
# Review language tag
review_language_tag = review['language']
# Review's automatically detected language
if review_id in previously_detected_languages_dict[app_id].keys():
detected_language = previously_detected_languages_dict[app_id][review_id]
else:
try:
DetectorFactory.seed = 0
detected_language = detect(review_content)
except lang_detect_exception.LangDetectException:
detected_language = 'unknown'
previously_detected_languages_dict[app_id][review_id] = detected_language
previously_detected_languages_dict['has_changed'] = True
language_dict[review_id] = dict()
language_dict[review_id]['tag'] = review_language_tag
language_dict[review_id]['detected'] = detected_language
language_dict[review_id]['voted_up'] = is_a_positive_review
return language_dict, previously_detected_languages_dict
from langdetect import detect
from langdetect import detect_langs
from langdetect import DetectorFactory
# https://github.com/Mimino666/langdetect
# Seed the detector factory to ensure deterministic behaviour.
DetectorFactory.seed = 0

# print() calls instead of Python 2 print statements: the statement form is a
# SyntaxError on Python 3, while a single parenthesised argument prints the
# same output on both versions.
print(detect("War doesn't show who's right, just who's left."))
print(detect("Ein, zwei, drei, vier"))
print(detect_langs("Otec matka syn."))