(从前一个问题继续)
我正在尝试部署一个 Dataflow 作业,并按照这里描述的方法,在 Google App Engine 上将其作为 cron 作业运行。
我在 pipelines/script.py 中有一个 Dataflow 脚本(用 Python 编写)。在本地(使用 Apache Beam 的 DirectRunner)或在 Google Cloud 上(使用 DataflowRunner)运行此脚本都可以正常工作。但是,当把作业部署到 App Engine 上定期运行时,作业在执行时会引发以下错误:
(4cb822d7f796239a): Traceback (most recent call last): File
"/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py",
line 582, in do_work
work_executor.execute() File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/executor.py",
line 166, in execute
op.start() File "apache_beam/runners/worker/operations.py", line 294, in apache_beam.runners.worker.operations.DoOperation.start
(apache_beam/runners/worker/operations.c:10607)
def start(self): File "apache_beam/runners/worker/operations.py", line 295, in
apache_beam.runners.worker.operations.DoOperation.start
(apache_beam/runners/worker/operations.c:10501)
with self.scoped_start_state: File "apache_beam/runners/worker/operations.py", line 300, in
apache_beam.runners.worker.operations.DoOperation.start
(apache_beam/runners/worker/operations.c:9702)
pickler.loads(self.spec.serialized_fn)) File "/usr/local/lib/python2.7/dist-
packages/apache_beam/internal/pickler.py", line 225, in loads
return dill.loads(s) File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 277, in
loads
return load(file) File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 266, in
load
obj = pik.load() File "/usr/lib/python2.7/pickle.py", line 858, in load
dispatch[key](self) File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
klass = self.find_class(module, name) File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 423, in
find_class
return StockUnpickler.find_class(self, module, name) File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
__import__(module) ImportError: No module named pipelines.spanner_backup
这是在 Google Cloud 控制台的 Dataflow 面板中直接查看作业时看到的堆栈跟踪。但是,如果我单击“Stack trace”,在“Stackdriver Reporting”面板中查看该错误的堆栈跟踪,则会看到以下跟踪:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 738, in run
work, execution_context, env=self.environment)
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/workitem.py", line 130, in get_work_items
work_item_proto.sourceOperationTask.split)
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/workercustomsources.py", line 142, in __init__
source_spec[names.SERIALIZED_SOURCE_KEY]['value'])
File "/usr/local/lib/python2.7/dist-packages/apache_beam/internal/pickler.py", line 225, in loads
return dill.loads(s)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 277, in loads
return load(file)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 266, in load
obj = pik.load()
File "/usr/lib/python2.7/pickle.py", line 858, in load
dispatch[key](self)
File "/usr/lib/python2.7/pickle.py", line 1090, in load_global
klass = self.find_class(module, name)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 423, in find_class
return StockUnpickler.find_class(self, module, name)
File "/usr/lib/python2.7/pickle.py", line 1124, in find_class
__import__(module)
ImportError: No module named spanner.client
这是否暗示在 worker 之间共享对象时出现了某种导入错误?不过,google-cloud-spanner 应该已经正确安装了。
我正在使用:
Flask==0.12.2
apache-beam[gcp]==2.1.1
gunicorn==19.7.1
gevent==1.2.1
google-cloud-dataflow==2.1.1
google-cloud-spanner==0.26
我漏掉了什么吗?
编辑:我的 setup.py 如下所示(如这里所述;带注释的对应 GitHub 文件见这里):
from distutils.command.build import build as _build
import subprocess
import setuptools
class build(_build):  # pylint: disable=invalid-name
    """Build command that also schedules the 'CustomCommands' sub-command.

    Extends the stock build command's ``sub_commands`` list with an extra
    ``('CustomCommands', None)`` entry; everything else is inherited.
    """

    sub_commands = _build.sub_commands + [('CustomCommands', None)]
# Commands (each an argv-style list) that CustomCommands.run executes in order.
CUSTOM_COMMANDS = [
    ['echo', 'Custom command worked!'],
]
class CustomCommands(setuptools.Command):
    """A setuptools Command class able to run arbitrary commands."""

    def initialize_options(self):
        """Do nothing: this command defines no options to initialise."""
        pass

    def finalize_options(self):
        """Do nothing: this command defines no options to finalise."""
        pass

    def RunCustomCommand(self, command_list):
        """Run one command and fail loudly if it exits with a non-zero status.

        Args:
            command_list: The command and its arguments, passed as-is to
                ``subprocess.Popen``.

        Raises:
            RuntimeError: If the process return code is not 0.
        """
        # Parenthesised single-argument print works on Python 2 and Python 3.
        print('Running command: %s' % command_list)
        p = subprocess.Popen(
            command_list,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            # Fold stderr into stdout so a single stream is reported below.
            stderr=subprocess.STDOUT)
        # Can use communicate(input='y\n'.encode()) if the command run requires
        # some confirmation.
        stdout_data, _ = p.communicate()
        print('Command output: %s' % stdout_data)
        if p.returncode != 0:
            raise RuntimeError(
                'Command %s failed: exit code: %s' % (command_list, p.returncode))

    def run(self):
        """Execute every entry of the module-level CUSTOM_COMMANDS in order."""
        for command in CUSTOM_COMMANDS:
            self.RunCustomCommand(command)
# Pinned requirements handed to setuptools.setup(install_requires=...).
REQUIRED_PACKAGES = [
    "Flask==0.12.2",
    "apache-beam[gcp]==2.1.1",
    "gunicorn==19.7.1",
    "gevent==1.2.1",
    "google-cloud-dataflow==2.1.1",
    "google-cloud-spanner==0.26",
]
setuptools.setup(
name='dataflow_python_pipeline',
version='1.0.0',
description='DataFlow Python Pipeline',
install_requires=REQUIRED_PACKAGES,
packages=setuptools.find_packages(),
cmdclass={
'build': build,
'CustomCommands': CustomCommands,
}
)
发布于 2017-10-24 08:30:43
以下是解决我的问题的办法,记录于此以备参考。感谢 Marcin Zabloki 的帮助。
似乎我没有正确地把 setup 文件(setup.py)关联到管道上。通过替换
# Earlier configuration (the one being replaced): sets save_main_session and
# points requirements_file at requirements.txt.
pipeline_options = PipelineOptions()
pipeline_options.view_as(SetupOptions).save_main_session = True
pipeline_options.view_as(SetupOptions).requirements_file = "requirements.txt"
# Google Cloud settings: project, job name, and staging/temp paths under
# BUCKET_URL.
google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT_ID
google_cloud_options.job_name = JOB_NAME
google_cloud_options.staging_location = '%s/staging' % BUCKET_URL
google_cloud_options.temp_location = '%s/tmp' % BUCKET_URL
pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
为
# Replacement configuration: points setup_file at ./setup.py; neither
# save_main_session nor requirements_file is set here.
pipeline_options = PipelineOptions()
pipeline_options.view_as(SetupOptions).setup_file = "./setup.py"
# Google Cloud settings: project, job name, and staging/temp paths under
# BUCKET_URL.
google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT_ID
google_cloud_options.job_name = JOB_NAME
google_cloud_options.staging_location = '%s/staging' % BUCKET_URL
google_cloud_options.temp_location = '%s/tmp' % BUCKET_URL
pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
(并且把需要安装的模块写进 setup.py 而不是 requirements.txt),同时把 ParDo 中用到的模块改为在函数内部局部导入,而不是在文件开头导入,我终于能够部署脚本了。
如果不这样做,似乎会导致一些奇怪的、未定义的行为(例如函数找不到在同一个文件中定义的类),而不会给出清晰的错误消息。
https://stackoverflow.com/questions/46807209
复制相似问题