文章/答案/技术大牛

发布

问设置 dask worker 的 lifetime 参数
EN

Stack Overflow用户

提问于 2020-12-21 17:38:09

回答 1查看 576关注 0票数 1

我正在按照这里的说明部署 dask，并创建了一个 config.yaml 来设置变量等。由于存在内存泄漏，我目前正尝试把 worker 的 lifetime 设置为 1 小时，并在到期后自动重启。参考 values.yaml（https://github.com/dask/helm-chart/blob/master/dask/values.yaml），我尝试做了如下设置：

worker:
  replicas: 8
  extraArgs:
    - "--lifetime 1hr --lifetime-restart --lifetime-stagger 5m"

然而，当我检查配置时，lifetime 的设置仍然没有体现出来：

import dask
import dask.distributed
from distributed import Client
client = Client()

dask.config.config

输出：

{'jupyter_port_80_tcp': 'tcp://10.0.72.11:80',
 'jupyter_port': 'tcp://10.0.72.11:80',
 'scheduler_port_80_tcp_proto': 'tcp',
 'jupyter_service_port': 80,
 'jupyter_service_host': '10.0.72.11',
 'jupyter_port_80_tcp_port': 80,
 'scheduler_port_80_tcp_addr': '10.0.250.48',
 'scheduler_address': 'dask-scheduler:8786',
 'scheduler_service_port_dask_webui': 80,
 'scheduler_port_8786_tcp_port': 8786,
 'scheduler_service_port': 8786,
 'scheduler_port_80_tcp': 'tcp://10.0.250.48:80',
 'jupyter_service_port_dask_jupyter': 80,
 'scheduler_port_8786_tcp': 'tcp://10.0.250.48:8786',
 'scheduler_port_80_tcp_port': 80,
 'jupyter_port_80_tcp_addr': '10.0.72.11',
 'scheduler_port': 'tcp://10.0.250.48:8786',
 'jupyter_port_80_tcp_proto': 'tcp',
 'scheduler_service_port_dask_scheduler': 8786,
 'scheduler_service_host': '10.0.250.48',
 'scheduler_port_8786_tcp_proto': 'tcp',
 'scheduler_port_8786_tcp_addr': '10.0.250.48',
 'temporary-directory': None,
 'dataframe': {'shuffle-compression': None},
 'array': {'svg': {'size': 120}, 'slicing': {'split-large-chunks': None}},
 'optimization': {'fuse': {'active': True,
   'ave-width': 1,
   'max-width': None,
   'max-height': inf,
   'max-depth-new-edges': None,
   'subgraphs': None,
   'rename-keys': True}},
 'distributed': {'version': 2,
  'scheduler': {'allowed-failures': 3,
   'bandwidth': 100000000,
   'blocked-handlers': [],
   'default-data-size': '1kiB',
   'events-cleanup-delay': '1h',
   'idle-timeout': None,
   'transition-log-length': 100000,
   'work-stealing': True,
   'work-stealing-interval': '100ms',
   'worker-ttl': None,
   'pickle': True,
   'preload': [],
   'preload-argv': [],
   'unknown-task-duration': '500ms',
   'default-task-durations': {'rechunk-split': '1us', 'shuffle-split': '1us'},
   'validate': False,
   'dashboard': {'status': {'task-stream-length': 1000},
    'tasks': {'task-stream-length': 100000},
    'tls': {'ca-file': None, 'key': None, 'cert': None},
    'bokeh-application': {'allow_websocket_origin': ['*'],
     'keep_alive_milliseconds': 500,
     'check_unused_sessions_milliseconds': 500}},
   'locks': {'lease-validation-interval': '10s', 'lease-timeout': '30s'},
   'http': {'routes': ['distributed.http.scheduler.prometheus',
     'distributed.http.scheduler.info',
     'distributed.http.scheduler.json',
     'distributed.http.health',
     'distributed.http.proxy',
     'distributed.http.statics']}},
  'worker': {'blocked-handlers': [],
   'multiprocessing-method': 'spawn',
   'use-file-locking': True,
   'connections': {'outgoing': 50, 'incoming': 10},
   'preload': [],
   'preload-argv': [],
   'daemon': True,
   'validate': False,
   'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False},
   'profile': {'interval': '10ms', 'cycle': '1000ms', 'low-level': False},
   'memory': {'target': 0.6, 'spill': 0.7, 'pause': 0.8, 'terminate': 0.95},
   'http': {'routes': ['distributed.http.worker.prometheus',
     'distributed.http.health',
     'distributed.http.statics']}},
  'nanny': {'preload': [], 'preload-argv': []},
  'client': {'heartbeat': '5s', 'scheduler-info-interval': '2s'},
  'deploy': {'lost-worker-timeout': '15s', 'cluster-repr-interval': '500ms'},
  'adaptive': {'interval': '1s',
   'target-duration': '5s',
   'minimum': 0,
   'maximum': inf,
   'wait-count': 3},
  'comm': {'retry': {'count': 0, 'delay': {'min': '1s', 'max': '20s'}},
   'compression': 'auto',
   'offload': '10MiB',
   'default-scheme': 'tcp',
   'socket-backlog': 2048,
   'recent-messages-log-length': 0,
   'zstd': {'level': 3, 'threads': 0},
   'timeouts': {'connect': '10s', 'tcp': '30s'},
   'require-encryption': None,
   'tls': {'ciphers': None,
    'ca-file': None,
    'scheduler': {'cert': None, 'key': None},
    'worker': {'key': None, 'cert': None},
    'client': {'key': None, 'cert': None}}},
  'dashboard': {'link': '{scheme}://{host}:{port}/status',
   'export-tool': False,
   'graph-max-items': 5000},
  'admin': {'tick': {'interval': '20ms', 'limit': '3s'},
   'max-error-length': 10000,
   'log-length': 10000,
   'log-format': '%(name)s - %(levelname)s - %(message)s',
   'pdb-on-err': False}},
 'rmm': {'pool-size': None},
 'ucx': {'tcp': None,
  'nvlink': None,
  'infiniband': None,
  'rdmacm': None,
  'cuda_copy': None,
  'net-devices': None,
  'reuse-endpoints': True},
 'scheduler': 'dask.distributed',
 'shuffle': 'tasks'}

此外，当我在本地尝试时，它似乎也不起作用：

import dask
import dask.distributed
from distributed import Client
client = Client(n_workers=8, lifetime="1 hour", lifetime_restart=True)

dask.config.config

输出：

{'temporary-directory': None,
 'dataframe': {'shuffle-compression': None},
 'array': {'svg': {'size': 120}, 'slicing': {'split-large-chunks': None}},
 'optimization': {'fuse': {'active': True,
   'ave-width': 1,
   'max-width': None,
   'max-height': inf,
   'max-depth-new-edges': None,
   'subgraphs': None,
   'rename-keys': True}},
 'distributed': {'version': 2,
  'scheduler': {'allowed-failures': 3,
   'bandwidth': 100000000,
   'blocked-handlers': [],
   'default-data-size': '1kiB',
   'events-cleanup-delay': '1h',
   'idle-timeout': None,
   'transition-log-length': 100000,
   'work-stealing': True,
   'work-stealing-interval': '100ms',
   'worker-ttl': None,
   'pickle': True,
   'preload': [],
   'preload-argv': [],
   'unknown-task-duration': '500ms',
   'default-task-durations': {'rechunk-split': '1us', 'shuffle-split': '1us'},
   'validate': False,
   'dashboard': {'status': {'task-stream-length': 1000},
    'tasks': {'task-stream-length': 100000},
    'tls': {'ca-file': None, 'key': None, 'cert': None},
    'bokeh-application': {'allow_websocket_origin': ['*'],
     'keep_alive_milliseconds': 500,
     'check_unused_sessions_milliseconds': 500}},
   'locks': {'lease-validation-interval': '10s', 'lease-timeout': '30s'},
   'http': {'routes': ['distributed.http.scheduler.prometheus',
     'distributed.http.scheduler.info',
     'distributed.http.scheduler.json',
     'distributed.http.health',
     'distributed.http.proxy',
     'distributed.http.statics']}},
  'worker': {'blocked-handlers': [],
   'multiprocessing-method': 'spawn',
   'use-file-locking': True,
   'connections': {'outgoing': 50, 'incoming': 10},
   'preload': [],
   'preload-argv': [],
   'daemon': True,
   'validate': False,
   'lifetime': {'duration': None, 'stagger': '0 seconds', 'restart': False},
   'profile': {'interval': '10ms', 'cycle': '1000ms', 'low-level': False},
   'memory': {'target': 0.6, 'spill': 0.7, 'pause': 0.8, 'terminate': 0.95},
   'http': {'routes': ['distributed.http.worker.prometheus',
     'distributed.http.health',
     'distributed.http.statics']}},
  'nanny': {'preload': [], 'preload-argv': []},
  'client': {'heartbeat': '5s', 'scheduler-info-interval': '2s'},
  'deploy': {'lost-worker-timeout': '15s', 'cluster-repr-interval': '500ms'},
  'adaptive': {'interval': '1s',
   'target-duration': '5s',
   'minimum': 0,
   'maximum': inf,
   'wait-count': 3},
  'comm': {'retry': {'count': 0, 'delay': {'min': '1s', 'max': '20s'}},
   'compression': 'auto',
   'offload': '10MiB',
   'default-scheme': 'tcp',
   'socket-backlog': 2048,
   'recent-messages-log-length': 0,
   'zstd': {'level': 3, 'threads': 0},
   'timeouts': {'connect': '10s', 'tcp': '30s'},
   'require-encryption': None,
   'tls': {'ciphers': None,
    'ca-file': None,
    'scheduler': {'cert': None, 'key': None},
    'worker': {'key': None, 'cert': None},
    'client': {'key': None, 'cert': None}}},
  'dashboard': {'link': '{scheme}://{host}:{port}/status',
   'export-tool': False,
   'graph-max-items': 5000},
  'admin': {'tick': {'interval': '20ms', 'limit': '3s'},
   'max-error-length': 10000,
   'log-length': 10000,
   'log-format': '%(name)s - %(levelname)s - %(message)s',
   'pdb-on-err': False}},
 'rmm': {'pool-size': None},
 'ucx': {'tcp': None,
  'nvlink': None,
  'infiniband': None,
  'rdmacm': None,
  'cuda_copy': None,
  'net-devices': None,
  'reuse-endpoints': True},
 'scheduler': 'dask.distributed',
 'shuffle': 'tasks'}

有什么建议吗？

python

dask

回答 1

Stack Overflow用户

发布于 2022-02-21 08:14:26

根据类定义，lifetime_restart 参数必须是 bool 类型，所以我认为应该写成：--lifetime_restart True

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/65397449

复制

相似问题

问设置dask工人的寿命参数
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问设置dask工人的寿命参数EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问设置dask工人的寿命参数
EN