# Pytest fixture yielding an awswrangler Session backed by a local Spark session
# (the pytest, SparkSession and awswrangler.Session imports are assumed from the test module).
@pytest.fixture()
def session():
    yield Session(spark_session=SparkSession.builder.appName("AWS Wrangler Test").getOrCreate())
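# A minimal sketch of how a test might consume the fixture above; the test name is
# hypothetical and it assumes the awswrangler Session exposes the wrapped Spark
# session as `spark_session`.
def test_session_smoke(session):
    df = session.spark_session.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    assert df.count() == 2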
import os
from glob import glob

import pyspark


def main():

    # Args
    mol_pattern = '/home/emountjoy_statgen/data/sumstats/molecular_trait/*.parquet'
    out_dir = '/home/emountjoy_statgen/data/sumstats/molecular_trait_2/'
    # mol_pattern = '/Users/em21/Projects/genetics-finemapping/example_data/sumstats/molecular_trait/*.parquet'
    # out_dir = '/Users/em21/Projects/genetics-finemapping/example_data/sumstats/molecular_trait_2/'

    # Make spark session
    spark = (
        pyspark.sql.SparkSession.builder
        .config("parquet.enable.summary-metadata", "true")
        .getOrCreate()
    )
    print('Spark version: ', spark.version)

    # Process each
    for inf in glob(mol_pattern):

        # Load
        df = spark.read.parquet(inf)

        # Write
        outf = os.path.join(out_dir, os.path.basename(inf))
        (
            df
            .write
            # The original snippet cuts off here; writing the DataFrame back out
            # as Parquet to the mirrored output path is the assumed intent.
            .mode('overwrite')
            .parquet(outf)
        )
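# The script presumably finishes with the usual entry-point guard:
if __name__ == '__main__':
    main()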
# -*- coding: UTF-8 -*-
# Created by thpffcj on 2019/10/19.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf

if __name__ == '__main__':
    spark = SparkSession.builder.appName("project").getOrCreate()

    data2015 = spark.read.format("csv")\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .load("file:///Users/thpffcj/Public/file/Beijing_2015_HourlyPM25_created20160201.csv")\
        .select("Year", "Month", "Day", "Hour", "Value", "QC Name")

    data2016 = spark.read.format("csv")\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .load("file:///Users/thpffcj/Public/file/Beijing_2016_HourlyPM25_created20170201.csv")\
        .select("Year", "Month", "Day", "Hour", "Value", "QC Name")

    data2017 = spark.read.format("csv")\
        .option("header", "true")\
        .option("inferSchema", "true")
import argparse
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from lxml import etree
import re
import time
import datetime
import sys
import signal
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
### SO 25407550
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
### Handle command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--input", action="append", help="Path to Parquet file containing raw XML. Can supply multiple.")
parser.add_argument("--output", action="store", help="Path in which to store result. Can be local or S3.", default="990_long/parsed")
parser.add_argument("--timestamp", action="store_true", help="If true, append the timestamp to the output path.")
parser.add_argument("--partitions", type=int, action="store", help="Number of partitions to use for XML parsing.", default=500)
parser.add_argument("--timeout", type=int, action="store", help="Number of seconds to spend parsing a single 990 eFile before it is skipped.", default=3)
parser.add_argument("--no-timeout", action="store_true", help="If true, don't time out, even if load is taking a long time.")
args = parser.parse_known_args()[0]
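# A sketch of how these arguments might be wired up downstream; the "xml" column
# name and the parse_990 helper are assumptions, and the SIGALRM-based timeout
# mirrors the --timeout/--no-timeout flags above.
raw = spark.read.parquet(*args.input).repartition(args.partitions)

def parse_990(xml_string):
    def _handler(signum, frame):
        raise TimeoutError("990 eFile took too long to parse")
    if not args.no_timeout:
        signal.signal(signal.SIGALRM, _handler)
        signal.alarm(args.timeout)
    try:
        # Return the root tag as a stand-in for real field extraction
        return etree.fromstring(xml_string.encode("utf-8")).tag
    except Exception:
        return None
    finally:
        signal.alarm(0)

parse_udf = udf(parse_990, StringType())

output_path = args.output
if args.timestamp:
    output_path = "{}/{}".format(output_path, int(time.time()))

raw.withColumn("root_tag", parse_udf(col("xml"))).write.parquet(output_path)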
def generate_external_dataset(output_url='file:///tmp/external_dataset'):
    """Creates an example dataset at output_url in Parquet format"""
    spark = SparkSession.builder \
        .master('local[2]') \
        .getOrCreate()
    sc = spark.sparkContext

    rows_count = 10
    # row_generator is defined elsewhere in the original module; it maps an
    # index to a Row with the example schema.
    rows_rdd = sc.parallelize(range(rows_count)) \
        .map(row_generator)

    spark.createDataFrame(rows_rdd). \
        write. \
        mode('overwrite'). \
        parquet(output_url)
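# row_generator is not shown above; a minimal stand-in (the id/value schema is
# invented purely so the example is self-contained) could be:
from pyspark.sql import Row

def row_generator(i):
    return Row(id=i, value=i * 10)

generate_external_dataset('file:///tmp/external_dataset')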
# (This snippet begins mid-way through building an argparse parser; the earlier
#  add_argument calls for url, user, password, metastore/jdbc table, driver and
#  cmd_type are not shown.)
parser.add_argument('-jdbcTruncate', dest='truncate', action='store')
parser.add_argument('-saveMode', dest='save_mode', action='store')
parser.add_argument('-saveFormat', dest='save_format', action='store')
parser.add_argument('-batchsize', dest='batch_size', action='store')
parser.add_argument('-fetchsize', dest='fetch_size', action='store')
parser.add_argument('-name', dest='name', action='store')
parser.add_argument('-numPartitions', dest='num_partitions', action='store')
parser.add_argument('-partitionColumn', dest='partition_column', action='store')
parser.add_argument('-lowerBound', dest='lower_bound', action='store')
parser.add_argument('-upperBound', dest='upper_bound', action='store')
parser.add_argument('-createTableColumnTypes',
                    dest='create_table_column_types', action='store')
arguments = parser.parse_args()

# Disable dynamic allocation by default to allow num_executors to take effect.
spark = SparkSession.builder \
    .appName(arguments.name) \
    .enableHiveSupport() \
    .getOrCreate()

if arguments.cmd_type == "spark_to_jdbc":
    spark_write_to_jdbc(spark,
                        arguments.url,
                        arguments.user,
                        arguments.password,
                        arguments.metastore_table,
                        arguments.jdbc_table,
                        arguments.jdbc_driver,
                        arguments.truncate,
                        arguments.save_mode,
                        arguments.batch_size,
                        arguments.num_partitions,
                        # the original call is truncated here; the remaining
                        # parsed option is presumably passed as well
                        arguments.create_table_column_types)
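# spark_write_to_jdbc itself is not shown; a plausible sketch using the standard
# DataFrame.write.jdbc API, treating the parameter order above as given:
def spark_write_to_jdbc(spark_session, url, user, password, metastore_table,
                        jdbc_table, driver, truncate, save_mode, batch_size,
                        num_partitions, create_table_column_types):
    df = spark_session.table(metastore_table)
    if num_partitions:
        df = df.repartition(int(num_partitions))
    properties = {"user": user, "password": password, "driver": driver}
    # JDBC writer options are passed through only when supplied
    if truncate:
        properties["truncate"] = truncate
    if batch_size:
        properties["batchsize"] = batch_size
    if create_table_column_types:
        properties["createTableColumnTypes"] = create_table_column_types
    df.write.jdbc(url=url, table=jdbc_table, mode=save_mode, properties=properties)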
"""
Estimator Transformer Param Example.
"""
from __future__ import print_function
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
# -*- encoding: utf8 -*-
from __future__ import absolute_import, division, print_function, \
    unicode_literals

import os
from typing import Dict, Text

from pyspark.sql import DataFrame, SparkSession

from olapy.core.mdx.executor.cube_loader import CubeLoader

spark = SparkSession.builder.appName("olapy").getOrCreate()


class SparkCubeLoader(CubeLoader):
    def load_tables(self):
        # type: () -> Dict[Text, DataFrame]
        """Load tables from csv files.

        :return: tables dict with table name as key and dataframe as value
        """
        tables = {}
        for file in os.listdir(self.cube_path):
            # to remove file extension ".csv"
            table_name = os.path.splitext(file)[0]
            value = spark.read.csv(
                os.path.join(self.cube_path, file),
                header=True,
                # the original snippet is truncated here; closing the call and
                # returning the dict as the docstring describes
            )
            tables[table_name] = value
        return tables
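# A hedged usage sketch: assuming CubeLoader's constructor accepts and stores the
# cube path (it is read back as self.cube_path above), and a hypothetical
# directory of CSV files:
loader = SparkCubeLoader('/path/to/cube_csv_files')
tables = loader.load_tables()
for name, df in tables.items():
    print(name, df.columns)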
def init_spark(self):
    """
    Initialize the Spark session.
    :return:
    """
    self.spark = SparkSession.builder \
        .master(self.SPARK_MASTER) \
        .appName(self.APP_NAME) \
        .getOrCreate()
    Parameters
    ----------
    data : dataset
        input dataset with column "sequence"
    n : int
        size of the n-gram
    shift : int
        start index for the n-gram
    outputCol : str
        name of the output column

    Returns
    -------
    dataset
        output dataset with appended ngram column
    '''
    session = SparkSession.builder.getOrCreate()

    # Encoder function to be passed as User Defined Function (UDF)
    def _ngrammer(s):
        ngram = []
        i, j = 0, 0
        t = int(len(s) / n)

        if len(s) < shift:
            return []

        s = s[shift:]

        while j < t:
            ngram.append(s[i: i + n])
            j += 1
            i += n
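        # The original is cut off mid-loop; a plausible completion returns the
        # accumulated n-grams from the helper...
        return ngram

    # ...and applies the helper as a UDF over the "sequence" column. The return
    # type and withColumn call are assumptions based on the docstring; the
    # udf/col/ArrayType/StringType imports are assumed from the module header.
    ngram_udf = udf(_ngrammer, ArrayType(StringType()))
    return data.withColumn(outputCol, ngram_udf(col("sequence")))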