def test_start_sentry_listener():
    spark_context = SparkContext.getOrCreate()
    gateway = spark_context._gateway
    assert gateway._callback_server is None
    _start_sentry_listener(spark_context)
    assert gateway._callback_server is not None
# This is placed here because otherwise it causes an error when used in a spark slave.
from pyspark import SparkConf, SparkContext, SQLContext, HiveContext
# This reads from default.ini and then xframes/config.ini
# if they exist.
self._env = Environment.create()
context = create_spark_config(self._env)
verbose = self._env.get_config('xframes', 'verbose', 'false').lower() == 'true'
hdfs_user_name = self._env.get_config('webhdfs', 'user', 'hdfs')
os.environ['HADOOP_USER_NAME'] = hdfs_user_name
config_pairs = [(k, v) for k, v in context.iteritems()]
self._config = (SparkConf().setAll(config_pairs))
if verbose:
    print 'Spark Config: {}'.format(config_pairs)
self._sc = SparkContext(conf=self._config)
self._sqlc = SQLContext(self._sc)
self._hivec = HiveContext(self._sc)
self.zip_path = []
version = [int(n) for n in self._sc.version.split('.')]
self.status_tracker = self._sc.statusTracker()
if cmp(version, [1, 4, 1]) >= 0:
    self.application_id = self._sc.applicationId
else:
    self.application_id = None
if verbose:
    print 'Spark Version: {}'.format(self._sc.version)
    if self.application_id:
        print 'Application Id: {}'.format(self.application_id)
if not context['spark.master'].startswith('local'):
    tmp = t2 / (1 + t2) * label
    for i in range(0, D):
        grad += [features[i] * tmp]
    return grad

def mySum(a, b):
    for i in range(0, D):
        a[i] += b[i]
    return a

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: sparkLR <points_file> <iterations>"
        exit(-1)
    sc = SparkContext(appName="pysparkLR")
    points = sc.textFile(sys.argv[1]).map(parsePoint).persist()
    N = points.count()
    iterations = int(sys.argv[2])
    for i in range(0, iterations):
        gradient = points.map(logisticLossGradient).reduce(mySum)
        for j in range(0, D):
            weights[j] -= gradient[j] / (N * sqrt(i + 1))
    # format and output weights to stdout
    line = str(weights[0])
    for i in range(1, D):
        line += " " + str(weights[i])
    sys.stdout.write(line + "\n")
    sc.stop()
def getSparkSessionInstance(sparkConf):
    if ('sparkSessionSingletonInstance' not in globals()):
        globals()['sparkSessionSingletonInstance'] = SparkSession\
            .builder\
            .config(conf=sparkConf)\
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: sql_network_wordcount.py <hostname> <port>", file=sys.stderr)
        exit(-1)
    host, port = sys.argv[1:]
    sc = SparkContext(appName="PythonSqlNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, int(port))
    words = lines.flatMap(lambda line: line.split(" "))
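    # (Usage note, not part of the original example: to feed this stream locally
    # you can run `nc -lk <port>` in another terminal and type lines of text.)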
    # Convert RDDs of the words DStream to DataFrame and run SQL query
    def process(time, rdd):
        print("========= %s =========" % str(time))
        try:
            # Get the singleton instance of SparkSession
            spark = getSparkSessionInstance(rdd.context.getConf())
            # Convert RDD[String] to RDD[Row] to DataFrame
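            # Sketch (not part of the snippet above): the upstream Spark example
            # sql_network_wordcount.py finishes process() roughly like this; it
            # assumes `from pyspark.sql import Row` at module level.
            rowRdd = rdd.map(lambda w: Row(word=w))
            wordsDataFrame = spark.createDataFrame(rowRdd)
            # Register a temporary view and run the word count as a SQL query.
            wordsDataFrame.createOrReplaceTempView("words")
            wordCountsDataFrame = spark.sql(
                "select word, count(*) as total from words group by word")
            wordCountsDataFrame.show()
        except Exception:
            pass

    words.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()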
parser.add_argument("-it", "--input_topic", help="input kafka topic", type=str, required=True)
parser.add_argument("-oz", "--output_zookeeper", help="output zookeeper hostname:port", type=str, required=True)
parser.add_argument("-ot", "--output_topic", help="output kafka topic", type=str, required=True)
# Parse arguments.
args = parser.parse_args()
# Set variables
application_name = os.path.basename(sys.argv[0]) # Application name used as identifier
kafka_partitions = 1 # Number of partitions of the input Kafka topic
window_duration = 10 # Analysis window duration (10 seconds)
window_slide = 10 # Slide interval of the analysis window (10 seconds)
# Spark context initialization
sc = SparkContext(appName=application_name + " " + " ".join(sys.argv[1:])) # Application name used as the appName
ssc = StreamingContext(sc, 1) # Spark microbatch is 1 second
# Initialize input DStream of flows from specified Zookeeper server and Kafka topic
input_stream = KafkaUtils.createStream(ssc, args.input_zookeeper, "spark-consumer-" + application_name,
{args.input_topic: kafka_partitions})
# Parse flows in the JSON format
input_stream_json = input_stream.map(lambda x: json.loads(x[1]))
# Process the data with the statistics functions defined above.
hourly_host_statistics = collect_hourly_stats(input_stream_json)
daily_host_statistics = collect_daily_stats(hourly_host_statistics)
kafka_producer = KafkaProducer(bootstrap_servers=args.output_zookeeper,
client_id="spark-producer-" + application_name)
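# Sketch (assumed, not part of the original snippet): one way to forward the
# computed statistics to the output Kafka topic is to serialize each record to
# JSON inside foreachRDD and send it with the producer created above.
def send_to_kafka(rdd):
    # collect() is only appropriate while the per-window result stays small.
    for record in rdd.collect():
        kafka_producer.send(args.output_topic, json.dumps(record).encode("utf-8"))
    kafka_producer.flush()

daily_host_statistics.foreachRDD(send_to_kafka)

# Start the streaming computation.
ssc.start()
ssc.awaitTermination()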
This script serves as an example of using drudge for complex symbolic
manipulations. The derivation here is based on the approach in G. E.
Scuseria et al., J. Chem. Phys. 89 (1988) 7382 (doi:10.1063/1.455269).
"""
from pyspark import SparkConf, SparkContext
from sympy import IndexedBase, Rational, symbols
from drudge import RestrictedPartHoleDrudge, Stopwatch
# Environment set-up.
conf = SparkConf().setAppName('rccsd')
ctx = SparkContext(conf=conf)
dr = RestrictedPartHoleDrudge(ctx)
dr.full_simplify = False
p = dr.names
e_ = p.e_
a, b, c, d = p.V_dumms[:4]
i, j, k, l = p.O_dumms[:4]
#
# Cluster excitation operator
#
# Here, we first write the cluster excitation operator in terms of the
# unitary group generator. Then they will be substituted by their fermion
# operator definition.
#
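# A sketch of the step described above (the amplitude names follow the
# published drudge rccsd example; treat this as illustrative rather than the
# exact original code): the singles-and-doubles cluster operator written in
# terms of the unitary group generator e_[p, q].
t = IndexedBase('t')
clusters = dr.einst(
    t[a, i] * e_[a, i]
    + Rational(1, 2) * t[a, b, i, j] * e_[a, i] * e_[b, j]
)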
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
sc = SparkContext('local')
denseVec1 = LabeledPoint(1.0, Vectors.dense([3.0,5.0,1.0]))
denseVec2 = LabeledPoint(0.0, Vectors.dense([2.0, 0.0, 1.0]))
vectors = [denseVec1, denseVec2]
points = sc.parallelize(vectors)
print(points)
# Set the decision tree parameters and train the model
numClasses = 3
categoricalFeaturesInfo = {}
numTrees = 3
featureSubsetStrategy = "auto"
impurity = "gini"
maxDepth = 5
maxBins = 32
seed = 5
decisionTreeModel = RandomForest.trainClassifier(points, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
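# Usage sketch (not part of the original snippet): score the training points
# with the trained model and report the training error.
predictions = decisionTreeModel.predict(points.map(lambda lp: lp.features))
labels_and_predictions = points.map(lambda lp: lp.label).zip(predictions)
training_error = labels_and_predictions.filter(lambda lp: lp[0] != lp[1]).count() / float(points.count())
print("Training error: {}".format(training_error))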
def main():
    '''This script creates a dataset of sequence segments derived from a
    non-redundant set. The dataset contains the sequence segment, the DSSP
    Q8 and DSSP Q3 code of the center residue in a sequence segment, and a
    Word2Vec encoding of the sequence segment.
    The data is saved in a JSON file specified by the user.
    '''
    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("secondaryStructureWord2VecEncodeDemo")
    sc = SparkContext(conf=conf)

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=20% seq. identity) of L-protein chains
    path = "../../resources/mmtf_reduced_sample/"
    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1
    seed = 123

    pdb = mmtfReader \
        .read_sequence_file(path, sc) \
        .flatMap(StructureToPolymerChains()) \
        .filter(Pisces(sequenceIdentity, resolution)) \
        .filter(ContainsLProteinChain())
os.path.join(os.getenv('HOME'), 'spark_exec_home'))
spConf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
spConf.set("spark.executor.memory", memoryPerExecutor)
print >> sys.stderr, 'memoryPerExecutor = ', memoryPerExecutor
try:
    sparkMaster = SparkMasterOverride
except:
    pass
if sparkMaster[:5] == "mesos":
    spConf.set("spark.cores.max", numExecutors)
else:
    # Spark master is YARN or local[N]
    spConf.set("spark.executor.instances", numExecutors)
    spConf.set("spark.executor.cores", coresPerExecutor)
spConf.setMaster(sparkMaster)
sc = SparkContext(conf=spConf)
return sc, numExecutors, numPartitions
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import StorageLevel
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from cqlengine import connection
from cqlengine.connection import get_session
import sys
file=sys.argv[1]
conf = SparkConf().setAppName("Finance News, Batch Trades").set("spark.cores.max", "120")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
json_format = [StructField("user", StringType(), True),
               StructField("company", StringType(), True),
               StructField("numstock", IntegerType(), True),
               StructField("timestamp", StringType(), True)]
df = sqlContext.read.json("hdfs://ec2-54-215-247-116.us-west-1.compute.amazonaws.com:9000" + file, StructType(json_format))
# Calculate current stock count holdings for each user and company
df.registerTempTable("trade_history")
df_stockcount = sqlContext.sql("SELECT user AS stockcount_user, company, SUM(numstock) AS stock_total FROM trade_history WHERE timestamp IS NOT NULL GROUP BY user, company")
# Sum total portfolio stock by user
df_stockcount.registerTempTable("stockcount")
df_totalportfolio = sqlContext.sql("SELECT stockcount_user AS totalportfolio_user, SUM(ABS(stock_total)) AS portfolio_total FROM stockcount GROUP BY stockcount_user")
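# Sketch (assumed, not shown in this snippet): with the spark-cassandra-connector
# package available, the aggregated portfolio totals could be persisted to
# Cassandra; the keyspace and table names below are placeholders.
df_totalportfolio.write \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="portfolio_total", keyspace="finance") \
    .mode("append") \
    .save()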