我有一个2d的python列表,我试图把它写到一个平面的parquet文件中
# Sample data: each row is [uuid, count, [metric1, metric2]] — note the
# nested metrics list in the last position.
dcm = [
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 99.99]],
    ['000076a0-b770-11e7-af3c-618a1ae0aeae', 4, [1.966436237, 999.9]],
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 9999]],
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 999.9]],
    ['000071e0-b813-11ea-a1aa-61e10098e853', 3, [2.378136118, 99.99]],
]

# Inspect the container and the per-field types of the first row.
print("the schema of :", type(dcm))
first_row = dcm[0]
print("the schema of 0 :", type(first_row[0]))
print("the schema of 1 :", type(first_row[1]))
print("the schema of 2 :", type(first_row[2]))
<class 'list'>
the schema of : <class 'list'>
the schema of 0 : <class 'str'>
the schema of 1 : <class 'numpy.int64'>
the schema of 2 : <class 'list'>
我试着把它转换成 pandas DataFrame,然后把它写成 parquet 文件,如下所示 (type(dcm) # list)
试着用 pyspark 写入:
# `col` must come from pyspark.sql.functions — the reported
# "col() missing 1 required positional argument: 'strg'" means a different
# `col` (or the unbound module attribute) was in scope.
from pyspark.sql.functions import col
from pyspark.sql.types import (
    ArrayType, FloatType, LongType, StringType, StructField, StructType,
)

# col2 holds integers (numpy.int64 in the sample), so declare it as
# LongType — StringType makes createDataFrame reject the int values.
act_schema = StructType([
    StructField('col1', StringType(), True),
    StructField('col2', LongType(), True),
    StructField('metrics', ArrayType(FloatType()), True),
])
final_output = spark.createDataFrame(data=dcm, schema=act_schema)

# Number of metric values per row; sometimes the last column can have
# more than one value, so derive it from the first row.
n = len(final_output.select(final_output.metrics).first()[0])

# Explode the metrics array into one flat column per element.
dataframeToWrite = final_output.select(
    final_output.col1,
    final_output.col2,
    *(col('metrics').getItem(i).alias(f'col{i}') for i in range(n)),
)
dataframeToWrite.write.parquet('/user/loc', mode="overwrite")
Error:
*(col('metrics').getItem(i).alias(f'col{i}') for i in range(n)))
TypeError: col() missing 1 required positional argument: 'strg'
预期产出:
'00004120-13e4-11eb-874d-637bf9657209', 2, 2.635471698,99.99
'000076a0-b770-11e7-af3c-618a1ae0aeae', 4, 1.966436237,999.9
'00004120-13e4-11eb-874d-637bf9657209', 2, 2.635471698,9999
'00004120-13e4-11eb-874d-637bf9657209', 2, 2.635471698,999.9
'000071e0-b813-11ea-a1aa-61e10098e853', 3, 2.378136118,99.99
发布于 2021-07-20 00:46:38
我不知道我是否完全理解了问题,但在我看来,只需要一个 for 循环:从每一行中取出最后一个元素(并将其从该行删除),再把其中的两个值分别追加回该行即可。
# Same sample rows: [uuid, count, [metric1, metric2]].
dcm = [
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 99.99]],
    ['000076a0-b770-11e7-af3c-618a1ae0aeae', 4, [1.966436237, 999.9]],
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 9999]],
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 999.9]],
    ['000071e0-b813-11ea-a1aa-61e10098e853', 3, [2.378136118, 99.99]],
]

# Flatten each row in place: replace the trailing metrics list with
# its individual elements.
for record in dcm:
    metrics = record.pop()   # detach the nested list from the end
    record.extend(metrics)   # splice its values back in, flat
print(dcm)
结果:
[
['00004120-13e4-11eb-874d-637bf9657209', 2, 2.635471698, 99.99],
['000076a0-b770-11e7-af3c-618a1ae0aeae', 4, 1.966436237, 999.9],
['00004120-13e4-11eb-874d-637bf9657209', 2, 2.635471698, 9999],
['00004120-13e4-11eb-874d-637bf9657209', 2, 2.635471698, 999.9],
['000071e0-b813-11ea-a1aa-61e10098e853', 3, 2.378136118, 99.99]
]
您的输出看起来像 csv,因此可以使用标准库的 csv 模块一次性写出。
import csv
# QUOTE_NONNUMERIC with a single-quote quotechar reproduces the expected
# output format: '...' around strings, bare numbers.
with open('output.csv', 'w') as fh:
    writer = csv.writer(fh, quotechar="'", quoting=csv.QUOTE_NONNUMERIC)
    writer.writerows(dcm)
您还可以使用 pyarrow 将 csv 文件或 pandas DataFrame 转换为 parquet 文件。
import pyarrow.csv
import pyarrow.parquet
# read_csv infers the column types from the csv contents.
table = pyarrow.csv.read_csv('output.csv')
#print(table.schema)
pyarrow.parquet.write_table(table, 'output-pyarrow.parquet')
或者您可以直接将 dcm 转换为 pandas.DataFrame,然后将其保存为 parquet 文件。
import pandas as pd
# Build the frame from the already-flattened rows, one column per value.
df = pd.DataFrame(dcm, columns=['col1', 'col2', 'metric1', 'metric2'])
df.to_parquet('output-pandas.parquet')
编辑:
最小工作实例
import csv

# Sample rows: [uuid, count, [metric1, metric2]].  The nested metrics
# list is flattened in place below so each row becomes four flat columns.
dcm = [
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 99.99]],
    ['000076a0-b770-11e7-af3c-618a1ae0aeae', 4, [1.966436237, 999.9]],
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 9999]],
    ['00004120-13e4-11eb-874d-637bf9657209', 2, [2.635471698, 999.9]],
    ['000071e0-b813-11ea-a1aa-61e10098e853', 3, [2.378136118, 99.99]],
]
for row in dcm:
    item = row.pop(-1)  # remove the nested metrics list from the end
    row += item         # splice its values back in, flat

print('--- dcm ---')
print(dcm)

# newline='' is required by the csv module: without it the writer emits
# an extra blank line between rows on Windows.
with open('output.csv', 'w', newline='') as fh:
    writer = csv.writer(fh)  # , quotechar="'", quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(['col1', 'col2', 'metric1', 'metric2'])
    writer.writerows(dcm)
# --- pyarrow: from csv ---
import pyarrow.csv
import pyarrow.parquet
# Column types are inferred from the csv contents
# (string / int64 / double for this data).
table = pyarrow.csv.read_csv('output.csv')
pyarrow.parquet.write_table(table, 'output-pyarrow-csv.parquet')
print('--- schema ---')
print(table.schema)
# --- pandas: from csv ---
import pandas as pd
# Reading the csv back lets pandas infer the dtypes.
df = pd.read_csv('output.csv')
# NOTE: to_parquet needs a parquet engine (pyarrow or fastparquet) installed.
df.to_parquet('output-pandas-csv.parquet')
# --- pandas: from data/list ---
import pandas as pd  # re-import is harmless; kept so the section is copy/paste independent
# Build the frame directly from the flattened rows, naming all four columns.
df = pd.DataFrame(dcm, columns=['col1', 'col2', 'metric1', 'metric2'])
df.to_parquet('output-pandas-data.parquet')
# --- pyarrow: from data/dict ---
import pyarrow.csv
import pyarrow.parquet
import pyarrow as pa

# Declare the target schema explicitly instead of relying on inference.
schema = pa.schema({
    "col1": pa.string(),
    "col2": pa.int64(),
    "metric1": pa.float64(),
    "metric2": pa.float64(),
})

# Pivot the flattened rows into per-column lists keyed by field name.
field_names = ['col1', 'col2', 'metric1', 'metric2']
columns = {
    name: [row[i] for row in dcm]
    for i, name in enumerate(field_names)
}

table = pyarrow.Table.from_pydict(columns, schema=schema)
pyarrow.parquet.write_table(table, 'output-pyarrow-data.parquet')
print('--- schema ---')
print(table.schema)
https://stackoverflow.com/questions/68447974
复制相似问题