首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >Google Vertex AI超参数调整: 500遇到内部错误

Google Vertex AI 超参数调优:遇到 500 内部错误(Internal error encountered)
EN

Stack Overflow用户
提问于 2021-06-11 14:48:13
回答 2查看 199关注 0票数 0

我试图按照此处(here)的说明,使用 Python SDK 在 Vertex AI 上运行一个超参数调优作业。大约 2 小时前,作业曾成功提交并开始运行。后来我发现代码中有一些错误导致运行失败,于是回去修复,然后重新运行代码,结果得到了如下所示的错误。

代码语言:javascript
复制
  Traceback (most recent call last):
  File "/workspace/.pip-modules/lib/python3.8/site-packages/google/api_core/grpc_helpers.py", line 67, in error_remapped_callable
    return callable_(*args, **kwargs)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
        status = StatusCode.INTERNAL
        details = "Internal error encountered."
        debug_error_string = "{"created":"@1623393121.374988331","description":"Error received from peer ipv4:142.251.33.74:443","file":"src/core/lib/surface/call.cc","file_line":1066,"grpc_message":"Internal error encountered.","grpc_status":13}"
>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/workspace/ariel_ml_2021/hparam_tuning.py", line 140, in <module>
    create_hyperparameter_tuning_job_python_package()
  File "/workspace/ariel_ml_2021/hparam_tuning.py", line 133, in create_hyperparameter_tuning_job_python_package
    response = client.create_hyperparameter_tuning_job(
  File "/workspace/.pip-modules/lib/python3.8/site-packages/google/cloud/aiplatform_v1/services/job_service/client.py", line 1363, in create_hyperparameter_tuning_job
    response = rpc(request, retry=retry, timeout=timeout, metadata=metadata,)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/google/api_core/gapic_v1/method.py", line 145, in __call__
    return wrapped_func(*args, **kwargs)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/google/api_core/grpc_helpers.py", line 69, in error_remapped_callable
    six.raise_from(exceptions.from_grpc_error(exc), exc)
  File "<string>", line 3, in raise_from
google.api_core.exceptions.InternalServerError: 500 Internal error encountered.

我认为这可能是由于我在python代码中所做的更改导致了错误,所以我尝试了原始副本(没有做任何更改),但错误仍然存在。如果需要,调整超参数的代码如下所示。

代码语言:python
复制
from google.cloud import aiplatform


def _conditional_parameter_spec(
    parameter_id, value_spec_key, value_spec, parent_values, scale_type
):
    """Build one conditional parameter spec dict tied to a discrete parent.

    Args:
        parameter_id: Identifier of the child hyperparameter.
        value_spec_key: Name of the value-spec field to populate, e.g.
            ``"discrete_value_spec"`` or ``"double_value_spec"``.
        value_spec: Dict stored under ``value_spec_key``.
        parent_values: Parent parameter values listed under
            ``parent_discrete_values``.
        scale_type: Scale type enum value stored on the child parameter.

    Returns:
        A dict with the keys ``parameter_spec`` and ``parent_discrete_values``.
    """
    return {
        "parameter_spec": {
            "parameter_id": parameter_id,
            value_spec_key: value_spec,
            "scale_type": scale_type,
        },
        # Copy so that every child owns its own list, as the literals did.
        "parent_discrete_values": {"values": list(parent_values)},
    }


def create_hyperparameter_tuning_job_python_package(
    project: str = "<my_project_id>",
    display_name: str = "<some_description>",
    executor_image_uri: str = "us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-4:latest",
    package_uri: str = "gs://<bucket_name>/",
    python_module: str = "train_second",  # located at gs://<bucket_name>/train_second.py
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
) -> None:
    """Submit a hyperparameter tuning job for a Python training package.

    The study maximizes the ``ariel_score`` metric. Its only top-level
    parameter is the discrete ``batch_size``; the parameters ``H1``, ``H2``,
    ``H3``, ``D1``, ``mean``, ``std`` and ``lr`` are attached to it as
    conditional parameters that list every ``batch_size`` value as their
    parent values. Each trial uses a single ``e2-standard-4`` replica, and
    the job requests 2 trials with 2 running in parallel.

    Args:
        project: Project ID used to build the ``parent`` resource path.
        display_name: Display name of the tuning job.
        executor_image_uri: Container image that executes the Python package.
        package_uri: Cloud Storage URI passed as the only ``package_uris``
            entry. NOTE(review): the default is a bucket-root placeholder;
            the Vertex AI docs describe these URIs as Python package
            *files* -- confirm the value points at a built package archive.
        python_module: Module name passed as ``python_module``.
        location: Region used to build the ``parent`` resource path.
        api_endpoint: Regional API endpoint the client connects to.

    Returns:
        None. The API response is printed to stdout.

    Raises:
        Any exception raised by the client call propagates unchanged; no
        errors are caught here.
    """
    client = aiplatform.gapic.JobServiceClient(
        client_options={"api_endpoint": api_endpoint}
    )

    linear_scale = aiplatform.gapic.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE

    metric = {
        "metric_id": "ariel_score",
        "goal": aiplatform.gapic.StudySpec.MetricSpec.GoalType.MAXIMIZE,
    }

    # Single source of truth for the parent values: used both as the
    # batch_size search space and as every child's activation values.
    batch_sizes = [10, 25, 50, 100]

    # (parameter_id, value-spec field, value spec), in submission order.
    child_definitions = [
        (
            "H1",
            "discrete_value_spec",
            {"values": [4, 8, 16, 32, 64, 128, 256, 512, 1024]},
        ),
        ("H2", "discrete_value_spec", {"values": [64, 128, 256, 512, 1024]}),
        (
            "H3",
            "discrete_value_spec",
            {"values": [4, 8, 16, 32, 64, 128, 256, 512, 1024]},
        ),
        ("D1", "double_value_spec", {"min_value": 0.01, "max_value": 0.5}),
        ("mean", "discrete_value_spec", {"values": [0., 1.]}),
        ("std", "double_value_spec", {"min_value": 0.005, "max_value": 0.5}),
        (
            "lr",
            "discrete_value_spec",
            {"values": [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3]},
        ),
    ]
    conditional_parameter_specs = [
        _conditional_parameter_spec(
            parameter_id, value_spec_key, value_spec, batch_sizes, linear_scale
        )
        for parameter_id, value_spec_key, value_spec in child_definitions
    ]

    parameter = {
        "parameter_id": "batch_size",
        "discrete_value_spec": {"values": list(batch_sizes)},
        "scale_type": linear_scale,
        "conditional_parameter_specs": conditional_parameter_specs,
    }

    # Trial job spec
    machine_spec = {
        "machine_type": "e2-standard-4",
    }
    worker_pool_spec = {
        "machine_spec": machine_spec,
        "replica_count": 1,
        "python_package_spec": {
            "executor_image_uri": executor_image_uri,
            "package_uris": [package_uri],
            "python_module": python_module,
            "args": [],
        },
    }

    # hparam tuning job
    hyperparameter_tuning_job = {
        "display_name": display_name,
        "max_trial_count": 2,
        "parallel_trial_count": 2,
        "study_spec": {
            "metrics": [metric],
            "parameters": [parameter],
        },
        "trial_job_spec": {"worker_pool_specs": [worker_pool_spec]},
    }

    parent = f"projects/{project}/locations/{location}"
    response = client.create_hyperparameter_tuning_job(
        parent=parent, hyperparameter_tuning_job=hyperparameter_tuning_job
    )
    # Plain string: the original f-string had no placeholders.
    print("response:", response)


# Script entry point: when run directly (not imported), submit the tuning
# job using the default arguments declared in the function signature.
if __name__ == "__main__":
    create_hyperparameter_tuning_job_python_package()

提前谢谢。

EN

回答 2

Stack Overflow用户

发布于 2021-06-11 19:38:48

似乎是 us-central1 的端点出现了这个问题。解决方法是改用另一个端点,例如 us-east1,这样问题就解决了。

票数 1
EN

Stack Overflow用户

发布于 2021-06-15 00:14:09

您的package_uri不正确。它应该指向包含Python包的文件(即包含所有代码的tar.bz文件),而不是目录或存储桶。

票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/67932401

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档