import sys

from pyspark.sql import SparkSession


def sum1(z):
    # Sum the neighbour weights, flooring the result at 1.0.
    # (The def line and the initial value of u are reconstructed from the call
    # site below; parseNeighbors and parseNeighbors1 are assumed to be defined
    # elsewhere in the original script.)
    u = 0.0
    for i in range(len(z)):
        u = u + float(z[i])
    if u > 1:
        return float(u)
    else:
        return 1.0


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: pagerank ", file=sys.stderr)
        sys.exit(-1)

    # Initialize the spark context.
    spark = SparkSession\
        .builder\
        .appName("PythonPageRank")\
        .getOrCreate()

    # Load the input file and build (node, neighbours) adjacency lists.
    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    links = lines.map(lambda batsman: parseNeighbors(batsman)).distinct().groupByKey().cache()
    links1 = lines.map(lambda batsman: parseNeighbors1(batsman)).distinct().groupByKey().cache()

    # Initial rank of each node is the floored sum of its neighbour weights.
    link = links.map(lambda x: (x[0], float(sum1(list(x[1])))))
    # print(links.collect())
    ranks = link.map(lambda url_neighbors: (url_neighbors[0], url_neighbors[1]))
    # print(ranks.collect())

    iterations = 0
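    # The snippet ends here, before the iterative rank update. A minimal sketch of
    # what typically follows, modeled on the stock Spark PageRank example; the
    # iteration count NUM_ITERATIONS and the helper computeContribs(neighbors, rank)
    # (yielding (neighbor, contribution) pairs) are assumptions, not part of the
    # original script.
    NUM_ITERATIONS = 10
    while iterations < NUM_ITERATIONS:
        contribs = links.join(ranks).flatMap(
            lambda node_links_rank: computeContribs(node_links_rank[1][0], node_links_rank[1][1]))
        ranks = contribs.reduceByKey(lambda a, b: a + b).mapValues(lambda rank: rank * 0.85 + 0.15)
        iterations += 1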
import sys

from random import random
from operator import add

from pyspark.sql import SparkSession

import PySparkTestInclude

if __name__ == "__main__":
    """
        Usage: pi [partitions]
    """
    # Make sure we can include this user-provided module
    PySparkTestInclude.func()

    spark = SparkSession\
        .builder\
        .appName("PythonPi")\
        .getOrCreate()

    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    def f(_):
        # Sample a point uniformly in [-1, 1] x [-1, 1] and count it
        # if it falls inside the unit circle.
        x = random() * 2 - 1
        y = random() * 2 - 1
        return 1 if x ** 2 + y ** 2 < 1 else 0

    count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))

    spark.stop()
from __future__ import print_function

from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SparkSession
from pyspark import SparkContext
import numpy as np

sc = SparkContext('local', 'logistic')
spark = SparkSession \
    .builder \
    .appName("Logistic regression w/ Spark ML") \
    .getOrCreate()

BUCKET = 'BUCKET_NAME'

# read dataset
traindays = spark.read \
    .option("header", "true") \
    .csv('gs://{}/flights/trainday.csv'.format(BUCKET))
traindays.createOrReplaceTempView('traindays')

from pyspark.sql.types import StringType, FloatType, StructType, StructField

header = 'FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,CARRIER,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,DEP_AIRPORT_LAT,DEP_AIRPORT_LON,DEP_AIRPORT_TZOFFSET,ARR_AIRPORT_LAT,ARR_AIRPORT_LON,ARR_AIRPORT_TZOFFSET,EVENT,NOTIFY_TIME'
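# A plausible next step, not shown in the snippet: turn the header string into a
# Spark schema using the types imported above. Treating every column as a string
# except a handful of numeric ones is an assumption here; the exact float columns
# depend on the rest of the original pipeline.
NUMERIC_COLS = ['DEP_DELAY', 'TAXI_OUT', 'ARR_DELAY', 'DISTANCE']  # assumed

def get_structfield(colname):
    if colname in NUMERIC_COLS:
        return StructField(colname, FloatType(), True)
    return StructField(colname, StringType(), True)

schema = StructType([get_structfield(colname) for colname in header.split(',')])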
    def __init__(self, I, B):
        CNN.__init__(self, I)
        self.B = B  # number of batches

        # Determine the number of output classes.
        classifications = load_classifications()
        C = len(classifications)
        self.init_layers(C)
        self.C = C

        # create spark context
        spark = SparkSession.builder.appName('cnn').getOrCreate()
        self.sc = spark.sparkContext
    >>> nXXXXXXXX/
    >>>     *.JPEG
    >>>
    >>> nZZZZZZZZ/
    >>>     *.JPEG

    :param imagenet_path: a path to the directory containing ``n*/`` subdirectories. If you are running this script
        on a Spark cluster, you should have this file be mounted and accessible to executors.
    :param output_url: the location where your dataset will be written to. Should be a url: either
        ``file://...`` or ``hdfs://...``
    :param spark_master: A master parameter used by spark session builder. Use default value (``None``) to use system
        environment configured spark cluster. Use ``local[*]`` to run on a local box.
    :param noun_id_to_text: A dictionary: ``{noun_id : text}``. If ``None``, this function will download the dictionary
        from the Internet.
    :return: ``None``
    """
    session_builder = SparkSession \
        .builder \
        .appName('Imagenet Dataset Creation') \
        .config('spark.executor.memory', '10g') \
        .config('spark.driver.memory', '10g')  # Increase the memory if running locally with a high number of executors

    if spark_master:
        session_builder.master(spark_master)

    spark = session_builder.getOrCreate()
    sc = spark.sparkContext

    # Get a list of noun_ids
    noun_ids = os.listdir(imagenet_path)
    if not all(noun_id.startswith('n') for noun_id in noun_ids):
        raise RuntimeError('Directory {} expected to contain only subdirectories with names '
                           'starting with "n".'.format(imagenet_path))
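# A hypothetical invocation of the function this docstring belongs to (its def line
# is not part of the snippet, so the name create_imagenet_dataset and the argument
# values below are assumptions for illustration only; the parameter names come from
# the docstring above):
if __name__ == '__main__':
    create_imagenet_dataset(
        imagenet_path='/mnt/imagenet/train',
        output_url='hdfs:///datasets/imagenet_petastorm',
        spark_master='local[*]',
        noun_id_to_text=None,  # let the function download the id-to-text dictionary
    )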
import tensorflow as tf

from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
from pyspark.ml.feature import VectorAssembler, OneHotEncoder

from sparkflow.graph_utils import build_graph, build_adam_config
from sparkflow.pipeline_util import PysparkPipelineWrapper


def small_model():
    # Two dense ReLU layers over flattened 28x28 MNIST images, with a
    # softmax cross-entropy loss over 10 classes (TensorFlow 1.x style).
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu, kernel_initializer=tf.glorot_uniform_initializer())
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu, kernel_initializer=tf.glorot_uniform_initializer())
    out = tf.layers.dense(layer2, 10, kernel_initializer=tf.glorot_uniform_initializer())
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss


if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[4]').config('spark.driver.memory', '2g') \
        .getOrCreate()

    # Read in mnist_train.csv dataset
    df = spark.read.option("inferSchema", "true").csv('examples/mnist_train.csv').orderBy(rand())

    # Build the tensorflow graph
    mg = build_graph(small_model)
    # Build the adam optimizer
    adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

    # Setup features
    vector_assembler = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')
    encoder = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)
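    # The snippet stops before the stages are chained together. A minimal sketch of
    # how they would typically be assembled with pyspark.ml's Pipeline. In the
    # sparkflow examples a SparkAsyncDL estimator (configured with the serialized
    # graph `mg` and `adam_config`) is appended as the final stage; it is omitted
    # here rather than guessing its exact parameters.
    from pyspark.ml import Pipeline

    pipeline = Pipeline(stages=[vector_assembler, encoder])
    featurized = pipeline.fit(df).transform(df)
    featurized.select('features', 'labels').show(5)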
from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorSlicerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3})),
        Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))])

    slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])

    output = slicer.transform(df)
    output.select("userFeatures", "features").show()
    # $example off$
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row, SparkSession


"""
A simple text classification pipeline that recognizes "spark" from
input text. This is to show how to create and configure a Spark ML
pipeline in Python. Run with:

  bin/spark-submit examples/src/main/python/ml/simple_text_classification_pipeline.py
"""


if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("SimpleTextClassificationPipeline")\
        .getOrCreate()

    # Prepare training documents, which are labeled.
    training = spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0)
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
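    # The snippet stops before the stages are chained and fit. In the stock Spark
    # example (simple_text_classification_pipeline.py) the script continues roughly
    # like this; the test documents below follow that example rather than anything
    # in the snippet itself.
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled.
    test = spark.createDataFrame([
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop")
    ], ["id", "text"])

    # Make predictions on the test documents and print columns of interest.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)

    spark.stop()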
    def __init_spark(cls):
        cls.spark = SparkSession.builder.appName(SPARK_APP_NAME).getOrCreate()


import mmlspark
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import length, col

spark = SparkSession.builder.appName("SimpleContServing").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("WARN")

print("creating df")
# Define an HTTP serving source listening on port 8888 that parses each request
# body into columns (foo: string, bar: int).
df = spark.readStream.continuousServer() \
    .address("0.0.0.0", 8888, "my_api") \
    .load() \
    .parseRequest(StructType().add("foo", StringType()).add("bar", IntegerType()))

# Reply to each request with the length of its "foo" field.
replies = df.withColumn("fooLength", length(col("foo")))\
    .makeReply("fooLength")

print("creating server")
server = replies\
    .writeStream \
    .continuousServer()
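# The original snippet is cut off at this point. A guess at how the chain usually
# continues in the mmlspark Spark Serving examples (replyTo / queryName / start);
# treat the exact method names and the checkpoint location below as assumptions to
# verify against the mmlspark documentation.
server = server \
    .replyTo("my_api") \
    .queryName("my_query") \
    .option("checkpointLocation", "file:///tmp/checkpoints") \
    .start()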