python - Spark union of multiple RDDs

Mots clés : python, apache-spark, pyspark, rdd

Les 3 meilleures réponses : python - Spark union of multiple RDDs

vote vote

98

# Union several RDDs with a single call to sc.union.
# `sc` is assumed to be an already-created SparkContext (e.g. the one the
# pyspark shell provides) -- it is not defined in this snippet.
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([4, 5, 6])
rdd3 = sc.parallelize([7, 8, 9])

rdd = sc.union([rdd1, rdd2, rdd3])
rdd.collect()

## [1, 2, 3, 4, 5, 6, 7, 8, 9]
from functools import reduce  # For Python 3.x

from pyspark.sql import DataFrame


def unionAll(*dfs):
    """Combine the given DataFrames by folding DataFrame.unionAll over them.

    The DataFrames are combined left to right. With a single argument that
    argument is returned as-is; with no arguments functools.reduce raises
    TypeError, because no initial value is supplied.
    """
    return reduce(DataFrame.unionAll, dfs)


# `sqlContext` is assumed to be an already-created SQLContext -- it is not
# defined in this snippet.
df1 = sqlContext.createDataFrame([(1, "foo1"), (2, "bar1")], ("k", "v"))
df2 = sqlContext.createDataFrame([(3, "foo2"), (4, "bar2")], ("k", "v"))
df3 = sqlContext.createDataFrame([(5, "foo3"), (6, "bar3")], ("k", "v"))

unionAll(df1, df2, df3).show()

## +---+----+
## |  k|   v|
## +---+----+
## |  1|foo1|
## |  2|bar1|
## |  3|foo2|
## |  4|bar2|
## |  5|foo3|
## |  6|bar3|
## +---+----+
def unionAll(*dfs):
    """Union DataFrames by unioning their underlying RDDs in one call.

    The ``rdd`` of every argument is passed, as one list, to the ``union``
    method of ``first.sql_ctx._sc``; a new DataFrame is then created from
    the result using the schema of the first argument. The schemas of the
    remaining arguments are not inspected here.

    Raises:
        ValueError: if called without any DataFrame.
    """
    if not dfs:
        raise ValueError("unionAll() requires at least one DataFrame")
    # Plain indexing works on Python 2 and 3 alike, unlike `first, *_ = dfs`.
    first = dfs[0]
    return first.sql_ctx.createDataFrame(
        first.sql_ctx._sc.union([df.rdd for df in dfs]),
        first.schema,
    )
vote vote

80

# Concatenate RDDs with the + operator.
# `sc` is assumed to be an already-created SparkContext -- it is not
# defined in this snippet.
rdd = sc.parallelize([1, 1, 2, 3])
(rdd + rdd).collect()

## [1, 1, 2, 3, 1, 1, 2, 3]
vote vote

72

# Union the RDDs step by step, keeping each intermediate result.
# rdd1..rdd4 are assumed to be existing RDDs -- they are not defined here.
first = rdd1.union(rdd2)
second = first.union(rdd3)
third = second.union(rdd4)
...  # and so on for any further RDDs
# Union four RDDs in one chained expression.
# rdd1..rdd4 are assumed to be existing RDDs -- they are not defined here.
result = (
    rdd1
    .union(rdd2)
    .union(rdd3)
    .union(rdd4)
)

Questions similaires