random42 = sorted(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q','R','S','T','U','V','W','X','Y'])
我需要在 pyspark 代码中用 nCr 生成如下所示的三字母标签组合:
'AAJ','AAB','AAC','AAD','AAE','AAF','AAG','AAH','AAI','AAJ','AAK','AAL','AAM','AAN','AAO','AAP','AAQ','AAR','AAS','BAA','BAB','BAC','BAD','BAE','BAF','BAG','BAV','BAI','BAJ','BAK','BAL','BAM','BAN','BAO','BAP','BAQ','BAR','BAS','BAT','BAU','BAV',...,'YYT','YYU','YYV','YYW','YYX','YYY'
>>> rdd = sc.parallelize([1, 2])
>>> sorted(rdd.cartesian(rdd).collect())
[(1, 1), (1, 2), (2, 1), (2, 2)]
我需要类似这样的组合,但每个组合由三个字母组成:
>>> rdd = sc.parallelize([1, 2, 3, 4, 5])
>>> sorted(rdd.cartesian(rdd).collect())
[(1, 2, 3), (1, 2, 4), (2, 3, 4), (2,3,5)]
(以上是我期望得到的输出示意;cartesian 本身只会返回二元组。)
我想在 pyspark 中实现,因为我有 1000 多个元素,需要计算类似 C(1000, 3) 的组合。
发布于 2019-10-15 01:50:54
这是一个使用 Spark DataFrame 实现的示例,希望对您有所帮助。主要思路是结合使用 itertools.combinations 和 udf。
注意:您可以将 RDD 转换为 Spark DataFrame,以套用我的示例。
import pyspark.sql.functions as F
import itertools
from pyspark.sql.types import StringType, ArrayType
# Create a sample DataFrame with a single column named 'txt'; each of the two
# rows holds one list of letters to draw combinations from.
# NOTE(review): assumes `spark` is an already-created SparkSession; it is not
# defined in this snippet.
df = spark.createDataFrame([(['A','B','C','D','E'],),(['F','G','H','I','J'],)],('txt',))
# Print the DataFrame to check its contents.
df.show()
# +---------------+
# | txt|
# +---------------+
# |[A, B, C, D, E]|
# |[F, G, H, I, J]|
# +---------------+
#udf part
def ncr(x, r):
    """Return every r-element combination of x, each joined into one string.

    Args:
        x: Iterable of strings to choose from (the elements are concatenated
            with ``''.join``, so they must be ``str``).
        r: Number of elements in each combination.

    Returns:
        A list of strings, one per combination, in the order produced by
        ``itertools.combinations``. The list is empty when ``r`` exceeds the
        number of elements in ``x``.
    """
    return [''.join(combo) for combo in itertools.combinations(x, r)]
# Size of each combination (the r in nCr); passed to ncr by the lambda below.
r = 3
# Wrap ncr as a Spark UDF whose declared return type is an array of strings.
udf_ncr = F.udf(lambda t: ncr(t,r), ArrayType(StringType()))
# Add an 'output' column computed by applying the UDF to the 'txt' column.
df = df.withColumn('output', udf_ncr('txt'))
# truncate=False is passed so the long combination lists are displayed in full.
df.show(truncate=False)
# +---------------+--------------------------------------------------+
# |txt |output |
# +---------------+--------------------------------------------------+
# |[A, B, C, D, E]|[ABC, ABD, ABE, ACD, ACE, ADE, BCD, BCE, BDE, CDE]|
# |[F, G, H, I, J]|[FGH, FGI, FGJ, FHI, FHJ, FIJ, GHI, GHJ, GIJ, HIJ]|
# +---------------+--------------------------------------------------+
Edit:新增 RDD 实现示例,如下所示:
# Build an RDD with two elements, each a list of five letters. The parentheses
# around each list are plain grouping (no trailing comma), so the elements are
# lists, not tuples.
# NOTE(review): assumes `sc` is an existing SparkContext; it is not defined in
# this snippet.
rdd = sc.parallelize([(['A','B','C','D','E']),(['F','G','H','I','J'])])
def ncr(x, r):
    """Return every r-element combination of x, each joined into one string.

    Args:
        x: Iterable of strings to choose from (the elements are concatenated
            with ``''.join``, so they must be ``str``).
        r: Number of elements in each combination.

    Returns:
        A list of strings, one per combination, in the order produced by
        ``itertools.combinations``. The list is empty when ``r`` exceeds the
        number of elements in ``x``.
    """
    return [''.join(combo) for combo in itertools.combinations(x, r)]
# Size of each combination (the r in nCr); passed to ncr by the lambda below.
r = 3
# Apply ncr to every element of the RDD and collect the resulting lists.
rdd.map(lambda x: ncr(x,r)).collect()
#[['ABC', 'ABD', 'ABE', 'ACD', 'ACE', 'ADE', 'BCD', 'BCE', 'BDE', 'CDE'],
# ['FGH', 'FGI', 'FGJ', 'FHI', 'FHJ', 'FIJ', 'GHI', 'GHJ', 'GIJ', 'HIJ']]
https://stackoverflow.com/questions/58379175
复制相似问题