我正在按照这里的说明部署 dask,并创建了一个 config.yaml 来设置变量等。由于存在内存泄漏,我目前正尝试将 worker 的 lifetime 设置为 1 小时,并在到期后自动重启。参考 values.yaml(https://github.com/dask/helm-chart/blob/master/dask/values.yaml),我尝试将其设置如下:
worker:
  replicas: 8
  extraArgs:
    - "--lifetime 1hr --lifetime-restart --lifetime-stagger 5m"
然而,当我检查配置时,lifetime 的设置仍然没有生效:
import dask
import dask.distributed
from distributed import Client
client = Client()
dask.config.config
输出:
{'jupyter_port_80_tcp': 'tcp://10.0.72.11:80',
'jupyter_port': 'tcp://10.0.72.11:80',
'scheduler_port_80_tcp_proto': 'tcp',
'jupyter_service_port': 80,
'jupyter_service_host': '10.0.72.11',
'jupyter_port_80_tcp_port': 80,
'scheduler_port_80_tcp_addr': '10.0.250.48',
'scheduler_address': 'dask-scheduler:8786',
'scheduler_service_port_dask_webui': 80,
'scheduler_port_8786_tcp_port': 8786,
'scheduler_service_port': 8786,
'scheduler_port_80_tcp': 'tcp://10.0.250.48:80',
'jupyter_service_port_dask_jupyter': 80,
'scheduler_port_8786_tcp': 'tcp://10.0.250.48:8786',
'scheduler_port_80_tcp_port': 80,
'jupyter_port_80_tcp_addr': '10.0.72.11',
'scheduler_port': 'tcp://10.0.250.48:8786',
'jupyter_port_80_tcp_proto': 'tcp',
'scheduler_service_port_dask_scheduler': 8786,
'scheduler_service_host': '10.0.250.48',
'scheduler_port_8786_tcp_proto': 'tcp',
'scheduler_port_8786_tcp_addr': '10.0.250.48',
'temporary-directory': None,
'dataframe': {'shuffle-compression': None},
'array': {'svg': {'size': 120}, 'slicing': {'split-large-chunks': None}},
'optimization': {'fuse': {'active': True,
'ave-width': 1,
'max-width': None,
'max-height': inf,
'max-depth-new-edges': None,
'subgraphs': None,
'rename-keys': True}},
'distributed': {'version': 2,
'scheduler': {'allowed-failures': 3,
'bandwidth': 100000000,
'blocked-handlers': [],
'default-data-size': '1kiB',
'events-cleanup-delay': '1h',
'idle-timeout': None,
'transition-log-length': 100000,
'work-stealing': True,
'work-stealing-interval': '100ms',
'worker-ttl': None,
'pickle': True,
'preload': [],
'preload-argv': [],
'unknown-task-duration': '500ms',
'default-task-durations': {'rechunk-split': '1us', 'shuffle-split': '1us'},
'validate': False,
'dashboard': {'status': {'task-stream-length': 1000},
'tasks': {'task-stream-length': 100000},
'tls': {'ca-file': None, 'key': None, 'cert': None},
'bokeh-application': {'allow_websocket_origin': ['*'],
'keep_alive_milliseconds': 500,
'check_unused_sessions_milliseconds': 500}},
'locks': {'lease-validation-interval': '10s', 'lease-timeout': '30s'},
'http': {'routes': ['distributed.http.scheduler.prometheus',
'distributed.http.scheduler.info',
'distributed.http.scheduler.json',
'distributed.http.health',
'distributed.http.proxy',
'distributed.http.statics']}},
'worker': {'blocked-handlers': [],
'multiprocessing-method': 'spawn',
'use-file-locking': True,
'connections': {'outgoing': 50, 'incoming': 10},
'preload': [],
'preload-argv': [],
'daemon': True,
'validate': False,
'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False},
'profile': {'interval': '10ms', 'cycle': '1000ms', 'low-level': False},
'memory': {'target': 0.6, 'spill': 0.7, 'pause': 0.8, 'terminate': 0.95},
'http': {'routes': ['distributed.http.worker.prometheus',
'distributed.http.health',
'distributed.http.statics']}},
'nanny': {'preload': [], 'preload-argv': []},
'client': {'heartbeat': '5s', 'scheduler-info-interval': '2s'},
'deploy': {'lost-worker-timeout': '15s', 'cluster-repr-interval': '500ms'},
'adaptive': {'interval': '1s',
'target-duration': '5s',
'minimum': 0,
'maximum': inf,
'wait-count': 3},
'comm': {'retry': {'count': 0, 'delay': {'min': '1s', 'max': '20s'}},
'compression': 'auto',
'offload': '10MiB',
'default-scheme': 'tcp',
'socket-backlog': 2048,
'recent-messages-log-length': 0,
'zstd': {'level': 3, 'threads': 0},
'timeouts': {'connect': '10s', 'tcp': '30s'},
'require-encryption': None,
'tls': {'ciphers': None,
'ca-file': None,
'scheduler': {'cert': None, 'key': None},
'worker': {'key': None, 'cert': None},
'client': {'key': None, 'cert': None}}},
'dashboard': {'link': '{scheme}://{host}:{port}/status',
'export-tool': False,
'graph-max-items': 5000},
'admin': {'tick': {'interval': '20ms', 'limit': '3s'},
'max-error-length': 10000,
'log-length': 10000,
'log-format': '%(name)s - %(levelname)s - %(message)s',
'pdb-on-err': False}},
'rmm': {'pool-size': None},
'ucx': {'tcp': None,
'nvlink': None,
'infiniband': None,
'rdmacm': None,
'cuda_copy': None,
'net-devices': None,
'reuse-endpoints': True},
'scheduler': 'dask.distributed',
'shuffle': 'tasks'}
此外,当我在本地尝试时,它似乎也不起作用:
import dask
import dask.distributed
from distributed import Client
client = Client(n_workers=8, lifetime="1 hour", lifetime_restart=True)
dask.config.config
输出:
{'temporary-directory': None,
'dataframe': {'shuffle-compression': None},
'array': {'svg': {'size': 120}, 'slicing': {'split-large-chunks': None}},
'optimization': {'fuse': {'active': True,
'ave-width': 1,
'max-width': None,
'max-height': inf,
'max-depth-new-edges': None,
'subgraphs': None,
'rename-keys': True}},
'distributed': {'version': 2,
'scheduler': {'allowed-failures': 3,
'bandwidth': 100000000,
'blocked-handlers': [],
'default-data-size': '1kiB',
'events-cleanup-delay': '1h',
'idle-timeout': None,
'transition-log-length': 100000,
'work-stealing': True,
'work-stealing-interval': '100ms',
'worker-ttl': None,
'pickle': True,
'preload': [],
'preload-argv': [],
'unknown-task-duration': '500ms',
'default-task-durations': {'rechunk-split': '1us', 'shuffle-split': '1us'},
'validate': False,
'dashboard': {'status': {'task-stream-length': 1000},
'tasks': {'task-stream-length': 100000},
'tls': {'ca-file': None, 'key': None, 'cert': None},
'bokeh-application': {'allow_websocket_origin': ['*'],
'keep_alive_milliseconds': 500,
'check_unused_sessions_milliseconds': 500}},
'locks': {'lease-validation-interval': '10s', 'lease-timeout': '30s'},
'http': {'routes': ['distributed.http.scheduler.prometheus',
'distributed.http.scheduler.info',
'distributed.http.scheduler.json',
'distributed.http.health',
'distributed.http.proxy',
'distributed.http.statics']}},
'worker': {'blocked-handlers': [],
'multiprocessing-method': 'spawn',
'use-file-locking': True,
'connections': {'outgoing': 50, 'incoming': 10},
'preload': [],
'preload-argv': [],
'daemon': True,
'validate': False,
'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False},
'profile': {'interval': '10ms', 'cycle': '1000ms', 'low-level': False},
'memory': {'target': 0.6, 'spill': 0.7, 'pause': 0.8, 'terminate': 0.95},
'http': {'routes': ['distributed.http.worker.prometheus',
'distributed.http.health',
'distributed.http.statics']}},
'nanny': {'preload': [], 'preload-argv': []},
'client': {'heartbeat': '5s', 'scheduler-info-interval': '2s'},
'deploy': {'lost-worker-timeout': '15s', 'cluster-repr-interval': '500ms'},
'adaptive': {'interval': '1s',
'target-duration': '5s',
'minimum': 0,
'maximum': inf,
'wait-count': 3},
'comm': {'retry': {'count': 0, 'delay': {'min': '1s', 'max': '20s'}},
'compression': 'auto',
'offload': '10MiB',
'default-scheme': 'tcp',
'socket-backlog': 2048,
'recent-messages-log-length': 0,
'zstd': {'level': 3, 'threads': 0},
'timeouts': {'connect': '10s', 'tcp': '30s'},
'require-encryption': None,
'tls': {'ciphers': None,
'ca-file': None,
'scheduler': {'cert': None, 'key': None},
'worker': {'key': None, 'cert': None},
'client': {'key': None, 'cert': None}}},
'dashboard': {'link': '{scheme}://{host}:{port}/status',
'export-tool': False,
'graph-max-items': 5000},
'admin': {'tick': {'interval': '20ms', 'limit': '3s'},
'max-error-length': 10000,
'log-length': 10000,
'log-format': '%(name)s - %(levelname)s - %(message)s',
'pdb-on-err': False}},
'rmm': {'pool-size': None},
'ucx': {'tcp': None,
'nvlink': None,
'infiniband': None,
'rdmacm': None,
'cuda_copy': None,
'net-devices': None,
'reuse-endpoints': True},
'scheduler': 'dask.distributed',
'shuffle': 'tasks'}
有什么建议吗?
发布于 2022-02-21 08:14:26
在类定义中,lifetime_restart 参数必须是布尔值(bool),所以我认为应该写成:--lifetime_restart True
https://stackoverflow.com/questions/65397449
复制相似问题