我正在测试Google云功能。目标是将超过10MB的数据传输到大型查询表中。我已经将数据分成块,然后使用该块将数据加载到大型查询中。分配的内存为2 2GB。然而,它不起作用。
from bigquery_uploader.datauploader import Datauploader, bq_streaming_cloud_function_table_schema
import sys
def upload_bigquery_chunks(data_list, bigquery_extractor, bq_table_id, chunk_mb=4):
def getChunks(data_list, n=20):
for i in range(0, len(data_list), n):
yield data_list[i: i + n]
avg_size_dd = sys.getsizeof(data_list) / len(data_list)
num_chunks = int(chunk_mb * (10 ** 6) / avg_size_dd)
data_chunks = getChunks(data_list, num_chunks)
for data in data_chunks:
print(sys.getsizeof(data))
bigquery_extractor.client.insert_rows_json(bq_table_id, data)
def bq_data_uploader_stream(request):
bq_project_id = 'my_project_id'
bq_dataset = 'my_dataset'
bq_table_id = "bq_streaming_cloud_function_table"
data_list = list()
# 1MB = 1 * (10**6)
MBs = 10
while sys.getsizeof(data_list) < MBs * (10 ** 6):
data_list.append({'campaign_name': 'Google Cloud Function', 'campaign_id': 123456789})
bq_extractor = Datauploader(bq_project_id)
bq_extractor.create_table(bq_dataset, bq_table_id, bq_streaming_cloud_function_table_schema)
bq_table_id = '{0}.{1}.{2}'.format(bq_project_id, bq_dataset, bq_table_id)
upload_bigquery_chunks(data_list, bq_extractor, bq_table_id)我得到以下错误。
Traceback (most recent call last):
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 346, in run_http_function
result = _function_handler.invoke_user_function(flask.request)
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 217, in invoke_user_function
return call_user_function(request_or_event)
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 210, in call_user_function
return self._user_function(request_or_event)
File "/user_code/main.py", line 34, in bq_data_uploader_stream
upload_bigquery_chunks(data_list, bq_extractor, bq_table_id)
File "/user_code/main.py", line 16, in upload_bigquery_chunks
bigquery_extractor.client.insert_rows_json(bq_table_id, data)
File "/env/local/lib/python3.7/site-packages/google/cloud/bigquery/client.py", line 2269, in insert_rows_json
retry, method="POST", path="%s/insertAll" % table.path, data=data
File "/env/local/lib/python3.7/site-packages/google/cloud/bigquery/client.py", line 476, in _call_api
return call()
File "/env/local/lib/python3.7/site-packages/google/api_core/retry.py", line 277, in retry_wrapped_func
on_error=on_error,
File "/env/local/lib/python3.7/site-packages/google/api_core/retry.py", line 182, in retry_target
return target()
File "/env/local/lib/python3.7/site-packages/google/cloud/_http.py", line 393, in api_request
raise exceptions.from_http_response(response)
google.api_core.exceptions.BadRequest: 400 POST https://bigquery.googleapis.com/bigquery/v2/projects/my_project_id/datasets/my_dataset/tables/bq_streaming_cloud_function_table/insertAll: Request payload size exceeds the limit: 10485760 bytes.发布于 2019-11-13 01:02:29
发布于 2019-11-13 00:08:22
正如您在Bigquery documentation中看到的,每个HTTP请求有10MB的限制。如果您将数据分成4mb的块,但仍然发送超过10mb或3个块的数据,您仍将达到此限制。
尝试限制流,或者作为一种可能的解决方法,您可以尝试将数据分块上传到云存储,然后直接从存储桶执行导入。
https://stackoverflow.com/questions/58817150
复制相似问题