How to use the pyspark.SparkContext class in pyspark

To help you get started, we’ve selected a few pyspark.SparkContext examples, based on popular ways it is used in public projects.
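Before diving into the project snippets below, here is a minimal sketch of the basic lifecycle: build a SparkConf, create the SparkContext, run a job, and stop it (the app name, master URL, and job are illustrative, not taken from any of the projects):

from pyspark import SparkConf, SparkContext

# Illustrative configuration; any master URL or app name works here.
conf = SparkConf().setAppName("example-app").setMaster("local[*]")
sc = SparkContext(conf=conf)

# A trivial job to confirm the context is alive.
print(sc.parallelize(range(10)).sum())

# Stop the context so its resources are released.
sc.stop()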


github getsentry / sentry-python / tests / integrations / spark / test_spark.py
def test_start_sentry_listener():
    spark_context = SparkContext.getOrCreate()

    gateway = spark_context._gateway
    assert gateway._callback_server is None

    _start_sentry_listener(spark_context)

    assert gateway._callback_server is not None
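The key call in this test is SparkContext.getOrCreate(), which returns an already-running context instead of raising an error when one exists. A minimal standalone sketch of that behaviour (not part of the test above; the conf values are illustrative):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("getOrCreate-demo")
sc1 = SparkContext.getOrCreate(conf)   # creates a context on the first call
sc2 = SparkContext.getOrCreate()       # later calls return the same context
assert sc1 is sc2
sc1.stop()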
github VeritoneAlpha / xpatterns-xframe / xframes / spark_context.py
        # This is placed here because otherwise it causes an error when used in a spark slave.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext
        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config('xframes', 'verbose', 'false').lower() == 'true'
        hdfs_user_name = self._env.get_config('webhdfs', 'user', 'hdfs')
        os.environ['HADOOP_USER_NAME'] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = (SparkConf().setAll(config_pairs))
        if verbose:
            print 'Spark Config: {}'.format(config_pairs)

        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split('.')]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None

        if verbose:
            print 'Spark Version: {}'.format(self._sc.version)
            if self.application_id:
                print 'Application Id: {}'.format(self.application_id)

        if not context['spark.master'].startswith('local'):
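The snippet above is Python 2 code (iteritems, print statements, cmp). The pattern it demonstrates, building a SparkConf from key/value pairs with setAll() and deriving a SQLContext from the SparkContext, looks roughly like this in Python 3 (the configuration values are illustrative):

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

settings = {                      # illustrative configuration values
    'spark.master': 'local[*]',
    'spark.app.name': 'xframes-style-setup',
    'spark.executor.memory': '2g',
}
conf = SparkConf().setAll(list(settings.items()))
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)

# applicationId is what the original guards behind its Spark >= 1.4.1 check.
print('Application Id:', sc.applicationId)
sc.stop()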
github skale-me / skale / examples / ml / benchmark / sparkLR.py
    tmp = t2 / (1 + t2) * label
    for i in range(0, D):
        grad += [features[i] * tmp]
    return grad

def mySum(a, b):
    for i in range (0, D):
        a[i] += b[i]
    return a

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: sparkLR <file> <iterations>"
        exit(-1)

    sc = SparkContext(appName="pysparkLR")
    points = sc.textFile(sys.argv[1]).map(parsePoint).persist()

    N = points.count()
    
    iterations = int(sys.argv[2])
    for i in range(0, iterations):
        gradient = points.map(logisticLossGradient).reduce(mySum)
        for j in range(0, D):
            weights[j] -= gradient[j] / (N * sqrt(i + 1))
    # format and output weights to stdout  
    line = str(weights[0])
    for i in range (1, D):
        line += " "  + str(weights[i])
    sys.stdout.write(line + "\n")
    sc.stop()
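The excerpt relies on parsePoint, logisticLossGradient, weights, and D, which are defined earlier in the full script. The SparkContext-level pattern it illustrates is caching an RDD with persist() and re-scanning it on every iteration; a stripped-down sketch of that pattern (the file name and the per-record computation are illustrative):

from pyspark import SparkContext

sc = SparkContext(master="local[*]", appName="iterativeSketch")

# Parse whitespace-separated numeric records and cache them, since the
# RDD is scanned once per iteration below.
points = sc.textFile("points.txt") \
    .map(lambda line: [float(x) for x in line.split()]) \
    .persist()

for i in range(10):
    # Each pass maps a value per record and reduces it back on the driver.
    total = points.map(lambda rec: sum(rec)).reduce(lambda a, b: a + b)
    print(i, total)

sc.stop()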
github qubole / spark-on-lambda / examples / src / main / python / streaming / sql_network_wordcount.py
def getSparkSessionInstance(sparkConf):
    if ('sparkSessionSingletonInstance' not in globals()):
        globals()['sparkSessionSingletonInstance'] = SparkSession\
            .builder\
            .config(conf=sparkConf)\
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: sql_network_wordcount.py   ", file=sys.stderr)
        exit(-1)
    host, port = sys.argv[1:]
    sc = SparkContext(appName="PythonSqlNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, int(port))
    words = lines.flatMap(lambda line: line.split(" "))

    # Convert RDDs of the words DStream to DataFrame and run SQL query
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            # Get the singleton instance of SparkSession
            spark = getSparkSessionInstance(rdd.context.getConf())

            # Convert RDD[String] to RDD[Row] to DataFrame
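The excerpt breaks off inside process(). In the upstream Spark example the function continues by converting the RDD of words into a DataFrame, registering it as a temporary view, and counting words with SQL; a sketch of that tail, which assumes Row is imported from pyspark.sql as in the original script:

            rowRdd = rdd.map(lambda w: Row(word=w))
            wordsDataFrame = spark.createDataFrame(rowRdd)

            # Register the DataFrame as a view and count each word with SQL.
            wordsDataFrame.createOrReplaceTempView("words")
            wordCountsDataFrame = spark.sql(
                "select word, count(*) as total from words group by word")
            wordCountsDataFrame.show()
        except Exception:
            pass

    words.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()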
github CSIRT-MU / Stream4Flow / applications / statistics / hosts_profiling / host_daily_profile.py
parser.add_argument("-it", "--input_topic", help="input kafka topic", type=str, required=True)

    parser.add_argument("-oz", "--output_zookeeper", help="output zookeeper hostname:port", type=str, required=True)
    parser.add_argument("-ot", "--output_topic", help="output kafka topic", type=str, required=True)

    # Parse arguments.
    args = parser.parse_args()

    # Set variables
    application_name = os.path.basename(sys.argv[0])  # Application name used as identifier
    kafka_partitions = 1  # Number of partitions of the input Kafka topic
    window_duration = 10  # Analysis window duration (10 seconds)
    window_slide = 10  # Slide interval of the analysis window (10 seconds)

    # Spark context initialization
    sc = SparkContext(appName=application_name + " " + " ".join(sys.argv[1:]))  # Application name used as the appName
    ssc = StreamingContext(sc, 1)  # Spark microbatch is 1 second

    # Initialize input DStream of flows from specified Zookeeper server and Kafka topic
    input_stream = KafkaUtils.createStream(ssc, args.input_zookeeper, "spark-consumer-" + application_name,
                                           {args.input_topic: kafka_partitions})

    # Parse flows in the JSON format
    input_stream_json = input_stream.map(lambda x: json.loads(x[1]))

    # Process data to the defined function.
    hourly_host_statistics = collect_hourly_stats(input_stream_json)
    daily_host_statistics = collect_daily_stats(hourly_host_statistics)

    kafka_producer = KafkaProducer(bootstrap_servers=args.output_zookeeper,
                                   client_id="spark-producer-" + application_name)
github tschijnmo / drudge / docs / examples / rccsd.py
This script serves as an example of using drudge for complex symbolic
manipulations.  The derivation here is going to be based on the approach in GE
Scuseria et al, J Chem Phys 89 (1988) 7382 (10.1063/1.455269).

"""

from pyspark import SparkConf, SparkContext
from sympy import IndexedBase, Rational, symbols

from drudge import RestrictedPartHoleDrudge, Stopwatch

# Environment setting up.

conf = SparkConf().setAppName('rccsd')
ctx = SparkContext(conf=conf)
dr = RestrictedPartHoleDrudge(ctx)
dr.full_simplify = False

p = dr.names
e_ = p.e_
a, b, c, d = p.V_dumms[:4]
i, j, k, l = p.O_dumms[:4]

#
# Cluster excitation operator
# 
# Here, we first write the cluster excitation operator in terms of the
# unitary group generator.  Then they will be substituted by their fermion
# operator definition.
#
github Jueee / SparkFastDataAnalysis / src / main / java / com / jueee / learnspark / dataanalysis / chapter11 / P53RandomForests.py
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext

sc = SparkContext('local')
denseVec1 = LabeledPoint(1.0, Vectors.dense([3.0,5.0,1.0]))
denseVec2 = LabeledPoint(0.0, Vectors.dense([2.0, 0.0, 1.0]))
vectors = [denseVec1, denseVec2]
points = sc.parallelize(vectors)
print(points)

# Set the decision tree parameters and train the model
numClasses = 3
categoricalFeaturesInfo = {}
numTrees = 3
featureSubsetStrategy = "auto"
impurity = "gini"
maxDepth = 5
maxBins = 32
seed = 5
decisionTreeModel = RandomForest.trainClassifier(points, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
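The snippet trains a model but never uses it. A short hedged follow-up, continuing with the variables above (the feature values are illustrative), showing the two prediction forms RandomForestModel supports, a single vector or an RDD of vectors:

# Predict a single dense vector and a whole RDD of feature vectors.
print(decisionTreeModel.predict(Vectors.dense([3.0, 5.0, 1.0])))
predictions = decisionTreeModel.predict(points.map(lambda lp: lp.features))
print(predictions.collect())

sc.stop()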
github sbl-sdsc / mmtf-pyspark / demos / ml / secondaryStructureWord2VecEncoder.py
def main():
    '''This class creates a dataset of sequence segment derived from a
    non-redundant set. The dataset contains the sequence segment, the DSSP
    Q8 and DSSP Q3 code of the center residue in a sequence segment, and a
    Word2Vec encoding of the sequence segment.
    The Data is saved in JSON file specified by the user.
    '''

    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("secondaryStructureWord2VecEncodeDemo")
    sc = SparkContext(conf=conf)

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=20% seq. identity) of L-protein chains

    path = "../../resources/mmtf_reduced_sample/"

    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1
    seed = 123

    pdb = mmtfReader \
        .read_sequence_file(path, sc) \
        .flatMap(StructureToPolymerChains()) \
        .filter(Pisces(sequenceIdentity, resolution)) \
        .filter(ContainsLProteinChain()) \
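The demo is cut off mid-pipeline; the Word2Vec encoding and dataset-writing steps that follow are omitted here. Consistent with the start = time.time() call above, a main() like this typically finishes by reporting elapsed time and stopping the context (a sketch, not the demo's exact code):

    end = time.time()
    print("Elapsed time: %f sec." % (end - start))
    sc.stop()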
github apache / incubator-sdap-nexus / climatology / clim / ClimatologySpark2.py
os.path.join(os.getenv('HOME'), 'spark_exec_home'))
        spConf.set("spark.executorEnv.PYTHONPATH", os.getcwd())
        spConf.set("spark.executor.memory", memoryPerExecutor)
        print >> sys.stderr, 'memoryPerExecutor = ', memoryPerExecutor
        try:
            sparkMaster = SparkMasterOverride
        except:
            pass
        if sparkMaster[:5] == "mesos":
            spConf.set("spark.cores.max", numExecutors)
        else:
            # Spark master is YARN or local[N]
            spConf.set("spark.executor.instances", numExecutors)
            spConf.set("spark.executor.cores", coresPerExecutor)
            spConf.setMaster(sparkMaster)
        sc = SparkContext(conf=spConf)
    return sc, numExecutors, numPartitions
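The helper above sizes executors differently depending on whether the master is Mesos, YARN, or local. A self-contained sketch of setting those same keys directly on a SparkConf (the values are illustrative, and executor settings only take effect on a real cluster manager, not on local[N]):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("climatology-sketch").setMaster("local[4]")
conf.set("spark.executor.instances", "4")   # ignored in local mode, used on YARN
conf.set("spark.executor.cores", "2")
conf.set("spark.executor.memory", "4g")

sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.executor.memory"))
sc.stop()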
github andyikchu / insightproject / batch_processing / trades.py
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import StorageLevel
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from cqlengine import connection
from cqlengine.connection import get_session
import sys

file=sys.argv[1]
conf = SparkConf().setAppName("Finance News, Batch Trades").set("spark.cores.max", "120")
sc = SparkContext(conf=conf) 
sqlContext = SQLContext(sc) 

json_format = [StructField("user", StringType(), True),
        StructField("company", StringType(), True),
        StructField("numstock", IntegerType(), True),
        StructField("timestamp", StringType(), True)]
df = sqlContext.read.json("hdfs://ec2-54-215-247-116.us-west-1.compute.amazonaws.com:9000" + file, StructType(json_format))

#calculate current stock count holdings for each user and company
df.registerTempTable("trade_history")
df_stockcount = sqlContext.sql("SELECT user AS stockcount_user, company, SUM(numstock) AS stock_total FROM trade_history WHERE timestamp IS NOT NULL GROUP BY user, company")

#sum total portfolio stock by user
df_stockcount.registerTempTable("stockcount")
df_totalportfolio = sqlContext.sql("SELECT stockcount_user AS totalportfolio_user, SUM(ABS(stock_total)) AS portfolio_total FROM stockcount GROUP BY stockcount_user")
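A short hedged follow-up showing how a result DataFrame such as df_totalportfolio is commonly inspected or persisted; the output path and format are illustrative and not taken from the original script:

df_totalportfolio.show(10)
df_totalportfolio.write.mode("overwrite").parquet("hdfs:///user/spark/portfolio_totals")
sc.stop()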