import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import PandasUDFType, pandas_udf
from pyspark.sql.types import *
import os
@pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
def split(df, validation_period):
    """Placeholder for the per-group split logic; returns ``df`` unchanged.

    Args:
        df: The data for one group (a pandas DataFrame when invoked as a
            grouped-map UDF).
        validation_period: Setting the split logic is meant to use; the
            placeholder body does not read it.

    Returns:
        The input ``df`` as received.

    NOTE(review): ``schema`` is not defined in the visible code and must be
    provided before this decorator runs. A grouped-map UDF is only handed the
    group's data by Spark, so ``validation_period`` cannot be supplied as an
    extra call argument through ``groupby(...).apply``.
    """
    return df
def train_test_split(spark, data_frame, request_json_data):
    """Convert the input to a Spark DataFrame and apply ``split`` per group.

    Args:
        spark: Session object used to build the Spark DataFrame via
            ``createDataFrame``.
        data_frame: Input data accepted by ``spark.createDataFrame``.
        request_json_data: Mapping that provides the ``'validation_period'``
            and ``'groupby_key'`` entries.

    Returns:
        None. The schema is printed and the grouped result is displayed
        with ``show()``.
    """
    data_frame = spark.createDataFrame(data_frame)
    print(data_frame.schema)
    validation_period = request_json_data['validation_period']
    groupby_key = request_json_data['groupby_key']
    # NOTE: this call cannot invoke split() and fails with "apply() takes 2
    # positional arguments but 3 were given". The intent is to pass
    # validation_period through to split() as an argument, but apply()
    # accepts only the UDF itself.
    data_frame.groupby(groupby_key).apply(split, validation_period).show()
发布于 2020-09-21 23:57:17
简短的回答:不能向 pandas 分组映射(grouped map)UDF 传递额外的参数,因为它只接受一个 pandas DataFrame 作为参数。
详细的回答:还有其他方法可以将 validation_period 传递给该函数,例如下面两种:
def split_fabric(validation_period):
    """Build a grouped-map UDF that captures ``validation_period``.

    The returned UDF takes only the group's pandas DataFrame, so the extra
    value is made available through the enclosing scope instead of as a call
    argument.

    Args:
        validation_period: Value the inner split logic can read via closure.

    Returns:
        The decorated single-argument UDF, to be used as
        ``data_frame.groupby(groupby_key).apply(split_fabric(validation_period))``.

    NOTE(review): ``schema`` is not defined in the visible code and must be
    provided by the caller's module. The decorator arguments were garbled in
    the source and are reconstructed to match the UDF declared in the
    question.
    """

    @pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
    def split(df):
        """Logic"""
        # validation_period is reachable here through the closure.
        return df

    return split
# Alternative: attach validation_period to every row as a literal column so
# the UDF can read it from the group's DataFrame. Here ``split`` must be a
# UDF that takes only the DataFrame.
data_frame \
    .withColumn("validation_period", F.lit(validation_period)) \
    .groupby(groupby_key).apply(split)
https://stackoverflow.com/questions/63992246
复制相似问题