文章/答案/技术大牛

发布

社区首页 >问答首页 >我在 Jupyter Notebook 中运行一段 Python 代码时遇到了这个错误：参数传递错误

问我在 Jupyter Notebook 中运行一段 Python 代码时遇到了这个错误：参数传递错误
EN

Stack Overflow用户

提问于 2022-04-25 10:30:24

回答 1查看 157关注 0票数 -3

我在 Jupyter Notebook 中运行下面的 Python 代码时出现了错误，请告诉我应该如何解决这个问题。

run codegen_sources/preprocessing/new_preprocess.py data/newtest_dataset obfuscation

我的python文件preprocessing.py包括以下代码

请告诉我传递这些参数的正确语法。我还尝试过下面这条命令：

%run codegen_sources/preprocessing/preprocess data/test_dataset 20  python obfuscation 8 500 200 400 roberta False data/bpe/cpp-java-python/vocab False data/bpe/cpp-java-python --1

下面是preprocessing.py的源代码

from pathlib import Path

import argparse
from submitit import AutoExecutor, LocalExecutor

from codegen_sources.preprocessing.bpe_modes.fast_bpe_mode import FastBPEMode
from codegen_sources.preprocessing.bpe_modes.roberta_bpe_mode import RobertaBPEMode
from codegen_sources.preprocessing.dataset_modes.monolingual_functions_mode import (
    MonolingualFunctionsMode,
)

from codegen_sources.preprocessing.dataset_modes.monolingual_mode import MonolingualMode
from codegen_sources.preprocessing.dataset_modes.obfuscation_mode import ObfuscationMode
from codegen_sources.preprocessing.dataset_modes.obfuscation_functions_mode import (
    ObfuscationFunctionsMode,
)


from codegen_sources.model.src.logger import create_logger
import logging
import multiprocessing
import os

from codegen_sources.preprocessing.utils import bool_flag

def preprocess(args):
    """Run the dataset preprocessing pipeline configured by ``args``.

    The function picks a dataset-mode class from ``args.mode`` and a BPE
    mode from ``args.bpe_mode``, creates three submitit executors
    (tokenization, BPE training, BPE application), and then calls the
    dataset's pipeline steps in order: extract/tokenize, split, learn BPE,
    apply BPE, build the vocabulary, binarize, and finally
    ``check_files_and_symlink_for_XLM``.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``input_path``, ``mode``, ``bpe_mode``, ``fastbpe_vocab_path``,
            ``fastbpe_code_path``, ``fastbpe_use_vocab``, ``local``,
            ``job_mem``, ``tokenization_timeout``, ``train_bpe_timeout``,
            ``bpe_timeout``, ``langs``, ``train_splits``, ``keep_comments``,
            ``local_parallelism``, ``percent_test_valid`` and ``ncodes``.

    Raises:
        KeyError: If ``args.mode`` is not one of the known dataset modes.
        AssertionError: If ``args.bpe_mode`` is neither "fast" nor "roberta".
    """
    create_logger(filepath=None, rank=0)
    logger = logging.getLogger()
    logger.info(f"Dataset pipeline for {args.input_path}")

    # dataset mode
    dataset_class = {
        "obfuscation": ObfuscationMode,
        "monolingual": MonolingualMode,
        "monolingual_functions": MonolingualFunctionsMode,
        "obfuscation_functions": ObfuscationFunctionsMode,
    }
    dataset_mode = dataset_class[args.mode]

    # bpe mode
    assert args.bpe_mode in ["fast", "roberta"]
    if args.bpe_mode == "fast":
        BPE_mode = FastBPEMode(
            vocab_path=args.fastbpe_vocab_path,
            codes=args.fastbpe_code_path,
            use_vocab=args.fastbpe_use_vocab,
        )
    else:
        BPE_mode = RobertaBPEMode()

    # All executors use the same "log" folder under the input path.
    log_dir = Path(args.input_path).joinpath("log")

    # Only an explicit ``False`` selects the cluster executors; any other
    # value falls through to local execution.
    if args.local is False:
        cluster_tokenization = AutoExecutor(log_dir)
        cluster_tokenization.update_parameters(
            cpus_per_task=40,
            mem_gb=args.job_mem,
            slurm_partition="learnlab",
            array_parallelism=200,
        )
        cluster_train_bpe = AutoExecutor(log_dir)
        cluster_train_bpe.update_parameters(
            cpus_per_task=1, mem_gb=args.job_mem, slurm_partition="learnlab",
        )
        cluster_apply_bpe = AutoExecutor(log_dir)
        cluster_apply_bpe.update_parameters(
            cpus_per_task=1,
            mem_gb=args.job_mem,
            slurm_partition="learnlab",
            array_parallelism=200,
        )
    else:
        cluster_tokenization = LocalExecutor(log_dir)
        cluster_train_bpe = LocalExecutor(log_dir)
        cluster_apply_bpe = LocalExecutor(log_dir)

    # Timeouts are applied in both the cluster and the local case.
    cluster_tokenization.update_parameters(timeout_min=args.tokenization_timeout)
    cluster_train_bpe.update_parameters(timeout_min=args.train_bpe_timeout)
    cluster_apply_bpe.update_parameters(timeout_min=args.bpe_timeout)

    dataset = dataset_mode(
        folder=args.input_path,
        languages=args.langs,
        bpe=BPE_mode,
        nb_train_split=args.train_splits,
        keep_comments=args.keep_comments,
    )
    dataset.extract_data_and_tokenize(
        executor=cluster_tokenization, local_parallelism=args.local_parallelism
    )

    # The same percentage is used for both the test and the valid split.
    dataset.get_train_test_valid_splits(
        percent_test=args.percent_test_valid,
        percent_valid=args.percent_test_valid,
        dedupe=True,
    )
    dataset.learn_bpe(ncodes=args.ncodes, executor=cluster_train_bpe)

    dataset.apply_bpe(
        executor=cluster_apply_bpe, local_parallelism=args.local_parallelism
    )
    dataset.get_vocab(executor=cluster_train_bpe)
    dataset.binarize(
        executor=cluster_apply_bpe, local_parallelism=args.local_parallelism
    )
    dataset.check_files_and_symlink_for_XLM()


if __name__ == "__main__":
    # Script entry point: declare the command-line interface, parse it,
    # normalise the input path and start the preprocessing pipeline.
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("input_path", help="root folder")

    # Optional flags as (flag, add_argument keyword options) pairs. They are
    # registered in list order, so --help shows them in this order.
    option_specs = [
        (
            "--local",
            dict(
                type=bool_flag,
                default=True,
                help="True if you want to run the processing pipeline locally, false if want to use submitit.",
            ),
        ),
        (
            "--local_parallelism",
            dict(
                type=int,
                default=None,
                help="When running locally, number of files read at the same time.",
            ),
        ),
        (
            "--langs",
            dict(
                nargs="+",
                default=["python", "java", "cpp"],
                help="list of languages to run on",
            ),
        ),
        (
            "--mode",
            dict(
                type=str,
                default="monolingual_functions",
                choices=[
                    "obfuscation",
                    "monolingual",
                    "monolingual_functions",
                    "obfuscation_functions",
                ],
                help="Type of dataset.",
            ),
        ),
        (
            "--train_splits",
            dict(type=int, default=8, help="Number of train splits."),
        ),
        (
            "--job_mem",
            dict(
                type=int,
                default=250,
                help="Memory in GB for jobs run on the cluster",
            ),
        ),
        (
            "--tokenization_timeout",
            dict(
                type=int,
                default=500,
                help="Timeout for tokenization/obfuscation jobs",
            ),
        ),
        (
            "--bpe_timeout",
            dict(type=int, default=240, help="Timeout for bpe jobs"),
        ),
        (
            "--train_bpe_timeout",
            dict(type=int, default=500, help="Timeout for bpe jobs"),
        ),
        (
            "--bpe_mode",
            dict(
                type=str,
                default="fast",
                choices=["fast", "roberta"],
                help="Type of BPE, should be roberta or fast.",
            ),
        ),
        (
            "--fastbpe_use_vocab",
            dict(
                type=bool_flag,
                default=False,
                help="Whether to use the vocab when applying BPE",
            ),
        ),
        (
            "--fastbpe_vocab_path",
            dict(
                type=str,
                default=None,
                help="Path to existing fastbpe vocab",
            ),
        ),
        (
            "--keep_comments",
            dict(
                type=bool_flag,
                default=False,
                help="Whether to keep the comments (does not happen with deobfuscation dataset).",
            ),
        ),
        (
            "--fastbpe_code_path",
            dict(
                type=str,
                default=None,
                help="Path to existing fastbpe codes",
            ),
        ),
        (
            "--ncodes",
            dict(
                type=int,
                default=50000,
                help="Number of codes to be learnt with fast bpe if no bpe codes is given.",
            ),
        ),
        (
            "--percent_test_valid",
            dict(
                type=int,
                default=1,
                help="Percentage of data that will be put into test and valid sets.",
            ),
        ),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    # Make the input folder absolute before it is handed to the pipeline.
    args.input_path = os.path.abspath(args.input_path)
    multiprocessing.set_start_method("fork")
    preprocess(args)

在传递有效的参数时，我会得到以下错误

run codegen_sources/preprocessing/new_preprocess.py data/newtest_dataset obfuscation



usage: new_preprocess.py [-h] [--local LOCAL]
                     [--local_parallelism LOCAL_PARALLELISM]
                     [--langs LANGS [LANGS ...]]
                     [--mode {obfuscation,monolingual,monolingual_functions,obfuscation_functions}]
                     [--train_splits TRAIN_SPLITS] [--job_mem JOB_MEM]
                     [--tokenization_timeout TOKENIZATION_TIMEOUT]
                     [--bpe_timeout BPE_TIMEOUT]
                     [--train_bpe_timeout TRAIN_BPE_TIMEOUT]
                     [--bpe_mode {fast,roberta}]
                     [--fastbpe_use_vocab FASTBPE_USE_VOCAB]
                     [--fastbpe_vocab_path FASTBPE_VOCAB_PATH]
                     [--keep_comments KEEP_COMMENTS]
                     [--fastbpe_code_path FASTBPE_CODE_PATH]
                     [--ncodes NCODES]
                     [--percent_test_valid PERCENT_TEST_VALID]
                     input_path
new_preprocess.py: error: unrecognized arguments: --obfuscation
An exception has occurred, use %tb to see the full traceback.

SystemExit: 2

jupyter

python

machine-learning

deep-learning

nlp

回答 1

Stack Overflow用户

发布于 2022-04-25 10:36:52

错误消息表明，您传入了一个解析器无法识别的参数 --obfuscation。我认为您真正想传给解析器的是 --mode obfuscation。

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/71998108

复制

相似问题

问我在木星笔记本上运行了一个python代码，发现了这个错误。参数传递误差
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问我在木星笔记本上运行了一个python代码，发现了这个错误。参数传递误差EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问我在木星笔记本上运行了一个python代码，发现了这个错误。参数传递误差
EN