|
|
|
@@ -10,6 +10,7 @@ import ( |
|
|
|
"os" |
|
|
|
"path" |
|
|
|
"regexp" |
|
|
|
"strconv" |
|
|
|
"strings" |
|
|
|
|
|
|
|
"code.gitea.io/gitea/modules/timeutil" |
|
|
|
@@ -39,6 +40,277 @@ import ( |
|
|
|
|
|
|
|
// jobNamePattern matches a job name of 3 to 36 characters made of lowercase
// letters, digits, '-' and '_'. The first character must be a letter or a
// digit, and the last character must not be '_'.
var jobNamePattern = regexp.MustCompile(`^[a-z0-9][a-z0-9-_]{1,34}[a-z0-9-]$`)

// Task type codes that CreateTrainJobOption.Type is compared against in this
// file.
const (
	// TaskTypeCloudbrainOne selects the CloudBrain One backend.
	TaskTypeCloudbrainOne = 0
	// TaskTypeModelArts selects the ModelArts backend (treated as an NPU task).
	TaskTypeModelArts = 1
	// TaskTypeGrampusGPU selects a Grampus (C2Net) GPU task.
	TaskTypeGrampusGPU = 2
	// TaskTypeGrampusNPU selects a Grampus (C2Net) NPU task.
	TaskTypeGrampusNPU = 3
)
|
|
|
|
|
|
|
func CloudbrainOneTrainJobCreate(ctx *context.Context, option api.CreateTrainJobOption) { |
|
|
|
|
|
|
|
displayJobName := option.DisplayJobName |
|
|
|
jobName := util.ConvertDisplayJobNameToJobName(displayJobName) |
|
|
|
image := strings.TrimSpace(option.Image) |
|
|
|
uuids := option.Attachment |
|
|
|
jobType := string(models.JobTypeTrain) |
|
|
|
|
|
|
|
codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath |
|
|
|
branchName := option.BranchName |
|
|
|
repo := ctx.Repo.Repository |
|
|
|
|
|
|
|
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), jobType, displayJobName)) |
|
|
|
defer lock.UnLock() |
|
|
|
spec, datasetInfos, datasetNames, err := checkParameters(ctx, option, lock, repo) |
|
|
|
if err != nil { |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error())) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
command, err := getTrainJobCommand(option) |
|
|
|
if err != nil { |
|
|
|
log.Error("getTrainJobCommand failed: %v", err) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error())) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath) |
|
|
|
if errStr != "" { |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr(errStr))) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) |
|
|
|
|
|
|
|
req := cloudbrain.GenerateCloudBrainTaskReq{ |
|
|
|
Ctx: ctx, |
|
|
|
DisplayJobName: displayJobName, |
|
|
|
JobName: jobName, |
|
|
|
Image: image, |
|
|
|
Command: command, |
|
|
|
Uuids: uuids, |
|
|
|
DatasetNames: datasetNames, |
|
|
|
DatasetInfos: datasetInfos, |
|
|
|
CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"), |
|
|
|
ModelPath: storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), |
|
|
|
BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), |
|
|
|
Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), |
|
|
|
BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), |
|
|
|
JobType: jobType, |
|
|
|
Description: option.Description, |
|
|
|
BranchName: branchName, |
|
|
|
BootFile: option.BootFile, |
|
|
|
Params: option.Params, |
|
|
|
CommitID: commitID, |
|
|
|
BenchmarkTypeID: 0, |
|
|
|
BenchmarkChildTypeID: 0, |
|
|
|
ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), |
|
|
|
Spec: spec, |
|
|
|
} |
|
|
|
|
|
|
|
if option.ModelName != "" { //使用预训练模型训练 |
|
|
|
req.ModelName = option.ModelName |
|
|
|
req.LabelName = option.LabelName |
|
|
|
req.CkptName = option.CkptName |
|
|
|
req.ModelVersion = option.ModelVersion |
|
|
|
req.PreTrainModelPath = setting.Attachment.Minio.RealPath + option.PreTrainModelUrl |
|
|
|
req.PreTrainModelUrl = option.PreTrainModelUrl |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
jobId, err := cloudbrain.GenerateTask(req) |
|
|
|
if err != nil { |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error())) |
|
|
|
return |
|
|
|
} |
|
|
|
ctx.JSON(http.StatusOK, models.BaseMessageApi{ |
|
|
|
Code: 0, |
|
|
|
Message: jobId, |
|
|
|
}) |
|
|
|
} |
|
|
|
func ModelArtsTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOption) { |
|
|
|
VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) |
|
|
|
displayJobName := option.DisplayJobName |
|
|
|
jobName := util.ConvertDisplayJobNameToJobName(displayJobName) |
|
|
|
uuid := option.Attachment |
|
|
|
description := option.Description |
|
|
|
workServerNumber := option.WorkServerNumber |
|
|
|
engineID, _ := strconv.Atoi(option.ImageID) |
|
|
|
bootFile := strings.TrimSpace(option.BootFile) |
|
|
|
params := option.Params |
|
|
|
repo := ctx.Repo.Repository |
|
|
|
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath |
|
|
|
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/" |
|
|
|
outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/" |
|
|
|
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/" |
|
|
|
branchName := option.BranchName |
|
|
|
isLatestVersion := modelarts.IsLatestVersion |
|
|
|
VersionCount := modelarts.VersionCountOne |
|
|
|
EngineName := option.Image |
|
|
|
|
|
|
|
errStr := checkMultiNode(ctx.User.ID, option.WorkServerNumber) |
|
|
|
if errStr != "" { |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr(errStr))) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName)) |
|
|
|
defer lock.UnLock() |
|
|
|
|
|
|
|
spec, _, _, err := checkParameters(ctx, option, lock, repo) |
|
|
|
if err != nil { |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error())) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//todo: del the codeLocalPath |
|
|
|
_, err = ioutil.ReadDir(codeLocalPath) |
|
|
|
if err == nil { |
|
|
|
os.RemoveAll(codeLocalPath) |
|
|
|
} |
|
|
|
|
|
|
|
gitRepo, _ := git.OpenRepository(repo.RepoPath()) |
|
|
|
commitID, _ := gitRepo.GetBranchCommitID(branchName) |
|
|
|
|
|
|
|
if err := downloadCode(repo, codeLocalPath, branchName); err != nil { |
|
|
|
log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed"))) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//todo: upload code (send to file_server todo this work?) |
|
|
|
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { |
|
|
|
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Failed to obsMkdir_output")) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { |
|
|
|
log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Failed to obsMkdir_log")) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
parentDir := VersionOutputPath + "/" |
|
|
|
if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { |
|
|
|
// if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed"))) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
var parameters models.Parameters |
|
|
|
param := make([]models.Parameter, 0) |
|
|
|
existDeviceTarget := false |
|
|
|
if len(params) != 0 { |
|
|
|
err := json.Unmarshal([]byte(params), ¶meters) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to Unmarshal params: %s (%v)", params, err) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("运行参数错误")) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
for _, parameter := range parameters.Parameter { |
|
|
|
if parameter.Label == modelarts.DeviceTarget { |
|
|
|
existDeviceTarget = true |
|
|
|
} |
|
|
|
if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { |
|
|
|
param = append(param, models.Parameter{ |
|
|
|
Label: parameter.Label, |
|
|
|
Value: parameter.Value, |
|
|
|
}) |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
if !existDeviceTarget { |
|
|
|
param = append(param, models.Parameter{ |
|
|
|
Label: modelarts.DeviceTarget, |
|
|
|
Value: modelarts.Ascend, |
|
|
|
}) |
|
|
|
} |
|
|
|
datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to getDatasUrlListByUUIDS: %v", err) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Failed to getDatasUrlListByUUIDS:"+err.Error())) |
|
|
|
return |
|
|
|
} |
|
|
|
dataPath := dataUrl |
|
|
|
jsondatas, err := json.Marshal(datasUrlList) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to Marshal: %v", err) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("json error:"+err.Error())) |
|
|
|
return |
|
|
|
} |
|
|
|
if isMultiDataset { |
|
|
|
param = append(param, models.Parameter{ |
|
|
|
Label: modelarts.MultiDataUrl, |
|
|
|
Value: string(jsondatas), |
|
|
|
}) |
|
|
|
} |
|
|
|
if option.ModelName != "" { //使用预训练模型训练 |
|
|
|
ckptUrl := "/" + option.PreTrainModelUrl + option.CkptName |
|
|
|
param = append(param, models.Parameter{ |
|
|
|
Label: modelarts.CkptUrl, |
|
|
|
Value: "s3:/" + ckptUrl, |
|
|
|
}) |
|
|
|
} |
|
|
|
|
|
|
|
req := &modelarts.GenerateTrainJobReq{ |
|
|
|
JobName: jobName, |
|
|
|
DisplayJobName: displayJobName, |
|
|
|
DataUrl: dataPath, |
|
|
|
Description: description, |
|
|
|
CodeObsPath: codeObsPath, |
|
|
|
BootFileUrl: codeObsPath + bootFile, |
|
|
|
BootFile: bootFile, |
|
|
|
TrainUrl: outputObsPath, |
|
|
|
WorkServerNumber: workServerNumber, |
|
|
|
EngineID: int64(engineID), |
|
|
|
LogUrl: logObsPath, |
|
|
|
PoolID: getPoolId(), |
|
|
|
Uuid: uuid, |
|
|
|
Parameters: param, |
|
|
|
CommitID: commitID, |
|
|
|
IsLatestVersion: isLatestVersion, |
|
|
|
BranchName: branchName, |
|
|
|
Params: option.Params, |
|
|
|
EngineName: EngineName, |
|
|
|
VersionCount: VersionCount, |
|
|
|
TotalVersionCount: modelarts.TotalVersionCount, |
|
|
|
DatasetName: datasetNames, |
|
|
|
Spec: spec, |
|
|
|
} |
|
|
|
if option.ModelName != "" { //使用预训练模型训练 |
|
|
|
req.ModelName = option.ModelName |
|
|
|
req.LabelName = option.LabelName |
|
|
|
req.CkptName = option.CkptName |
|
|
|
req.ModelVersion = option.ModelVersion |
|
|
|
req.PreTrainModelUrl = option.PreTrainModelUrl |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
userCommand, userImageUrl := getUserCommand(engineID, req) |
|
|
|
req.UserCommand = userCommand |
|
|
|
req.UserImageUrl = userImageUrl |
|
|
|
|
|
|
|
//将params转换Parameters.Parameter,出错时返回给前端 |
|
|
|
var Parameters modelarts.Parameters |
|
|
|
if err := json.Unmarshal([]byte(params), &Parameters); err != nil { |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("json.Unmarshal failed:"+err.Error())) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
jobId, err := modelarts.GenerateTrainJob(ctx, req) |
|
|
|
if err != nil { |
|
|
|
log.Error("GenerateTrainJob failed:%v", err.Error()) |
|
|
|
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error())) |
|
|
|
return |
|
|
|
} |
|
|
|
ctx.JSON(http.StatusOK, models.BaseMessageApi{ |
|
|
|
Code: 0, |
|
|
|
Message: jobId, |
|
|
|
}) |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOption) { |
|
|
|
|
|
|
|
displayJobName := option.DisplayJobName |
|
|
|
@@ -179,13 +451,19 @@ func checkParameters(ctx *context.Context, option api.CreateTrainJobOption, lock |
|
|
|
} |
|
|
|
|
|
|
|
computeResource := models.GPUResource |
|
|
|
if option.Type == 3 { |
|
|
|
if isNpuTask(option) { |
|
|
|
computeResource = models.NPUResource |
|
|
|
} |
|
|
|
|
|
|
|
//check count limit |
|
|
|
count, err := GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), computeResource) |
|
|
|
taskType := option.Type |
|
|
|
if isC2NetTask(option) { |
|
|
|
taskType = 2 |
|
|
|
} |
|
|
|
|
|
|
|
count, err := GetNotFinalStatusTaskCount(ctx.User.ID, taskType, string(models.JobTypeTrain), computeResource) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
log.Error("GetCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
return nil, nil, "", fmt.Errorf("system error") |
|
|
|
} else { |
|
|
|
if count >= 1 { |
|
|
|
@@ -195,7 +473,7 @@ func checkParameters(ctx *context.Context, option api.CreateTrainJobOption, lock |
|
|
|
} |
|
|
|
|
|
|
|
//check param |
|
|
|
if err := grampusParamCheckCreateTrainJob(option.BootFile, option.BranchName); err != nil { |
|
|
|
if err := paramCheckCreateTrainJob(option.BootFile, option.BranchName); err != nil { |
|
|
|
log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"]) |
|
|
|
return nil, nil, "", err |
|
|
|
} |
|
|
|
@@ -216,13 +494,26 @@ func checkParameters(ctx *context.Context, option api.CreateTrainJobOption, lock |
|
|
|
|
|
|
|
//check specification |
|
|
|
computeType := models.GPU |
|
|
|
if option.Type == 3 { |
|
|
|
|
|
|
|
if isNpuTask(option) { |
|
|
|
computeType = models.NPU |
|
|
|
} |
|
|
|
cluster := models.OpenICluster |
|
|
|
if isC2NetTask(option) { |
|
|
|
cluster = models.C2NetCluster |
|
|
|
} |
|
|
|
aiCenterCode := "" |
|
|
|
if option.Type == TaskTypeCloudbrainOne { |
|
|
|
aiCenterCode = models.AICenterOfCloudBrainOne |
|
|
|
} else if option.Type == TaskTypeModelArts { |
|
|
|
aiCenterCode = models.AICenterOfCloudBrainTwo |
|
|
|
} |
|
|
|
|
|
|
|
spec, err := resource.GetAndCheckSpec(ctx.User.ID, option.SpecId, models.FindSpecsOptions{ |
|
|
|
JobType: models.JobTypeTrain, |
|
|
|
ComputeResource: computeType, |
|
|
|
Cluster: models.C2NetCluster, |
|
|
|
Cluster: cluster, |
|
|
|
AiCenterCode: aiCenterCode, |
|
|
|
}) |
|
|
|
if err != nil || spec == nil { |
|
|
|
return nil, nil, "", fmt.Errorf("Resource specification is not available.") |
|
|
|
@@ -234,14 +525,26 @@ func checkParameters(ctx *context.Context, option api.CreateTrainJobOption, lock |
|
|
|
} |
|
|
|
|
|
|
|
//check dataset |
|
|
|
datasetInfos, datasetNames, err := models.GetDatasetInfo(option.Attachment, computeType) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) |
|
|
|
return nil, nil, "", fmt.Errorf(ctx.Tr("cloudbrain.error.dataset_select")) |
|
|
|
var datasetInfos map[string]models.DatasetInfo |
|
|
|
var datasetNames string |
|
|
|
if option.Type != TaskTypeModelArts { |
|
|
|
datasetInfos, datasetNames, err = models.GetDatasetInfo(option.Attachment, computeType) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) |
|
|
|
return nil, nil, "", fmt.Errorf(ctx.Tr("cloudbrain.error.dataset_select")) |
|
|
|
} |
|
|
|
} |
|
|
|
return spec, datasetInfos, datasetNames, err |
|
|
|
} |
|
|
|
|
|
|
|
func isNpuTask(option api.CreateTrainJobOption) bool { |
|
|
|
return option.Type == TaskTypeModelArts || option.Type == TaskTypeGrampusNPU |
|
|
|
} |
|
|
|
|
|
|
|
func isC2NetTask(option api.CreateTrainJobOption) bool { |
|
|
|
return option.Type == TaskTypeGrampusGPU || option.Type == TaskTypeGrampusNPU |
|
|
|
} |
|
|
|
|
|
|
|
func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOption) { |
|
|
|
|
|
|
|
displayJobName := option.DisplayJobName |
|
|
|
@@ -412,7 +715,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { |
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func grampusParamCheckCreateTrainJob(bootFile string, branchName string) error { |
|
|
|
func paramCheckCreateTrainJob(bootFile string, branchName string) error { |
|
|
|
if !strings.HasSuffix(strings.TrimSpace(bootFile), ".py") { |
|
|
|
log.Error("the boot file(%s) must be a python file", bootFile) |
|
|
|
return errors.New("启动文件必须是python文件") |
|
|
|
@@ -792,3 +1095,99 @@ func SyncTaskStatus(task *models.Cloudbrain) error { |
|
|
|
return nil |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
func getTrainJobCommand(option api.CreateTrainJobOption) (string, error) { |
|
|
|
var command string |
|
|
|
bootFile := strings.TrimSpace(option.BootFile) |
|
|
|
params := option.Params |
|
|
|
|
|
|
|
if !strings.HasSuffix(bootFile, ".py") { |
|
|
|
log.Error("bootFile(%s) format error", bootFile) |
|
|
|
return command, errors.New("bootFile format error") |
|
|
|
} |
|
|
|
|
|
|
|
var parameters models.Parameters |
|
|
|
var param string |
|
|
|
if len(params) != 0 { |
|
|
|
err := json.Unmarshal([]byte(params), ¶meters) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to Unmarshal params: %s (%v)", params, err) |
|
|
|
return command, err |
|
|
|
} |
|
|
|
|
|
|
|
for _, parameter := range parameters.Parameter { |
|
|
|
param += " --" + parameter.Label + "=" + parameter.Value |
|
|
|
} |
|
|
|
} |
|
|
|
if option.CkptName != "" { |
|
|
|
param += " --ckpt_url" + "=" + "/pretrainmodel/" + option.CkptName |
|
|
|
} |
|
|
|
|
|
|
|
command += "python /code/" + bootFile + param + " > " + cloudbrain.ModelMountPath + "/" + option.DisplayJobName + "-" + cloudbrain.LogFile |
|
|
|
|
|
|
|
return command, nil |
|
|
|
} |
|
|
|
|
|
|
|
func checkMultiNode(userId int64, serverNum int) string { |
|
|
|
if serverNum == 1 { |
|
|
|
return "" |
|
|
|
} |
|
|
|
modelarts.InitMultiNode() |
|
|
|
var isServerNumValid = false |
|
|
|
if modelarts.MultiNodeConfig != nil { |
|
|
|
for _, info := range modelarts.MultiNodeConfig.Info { |
|
|
|
if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg { |
|
|
|
if isInNodes(info.Node, serverNum) { |
|
|
|
isServerNumValid = true |
|
|
|
break |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
if isServerNumValid { |
|
|
|
return "" |
|
|
|
} else { |
|
|
|
return "repo.modelarts.no_node_right" |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// isInNodes reports whether num appears in nodes. A nil or empty slice never
// contains num.
func isInNodes(nodes []int, num int) bool {
	for _, node := range nodes {
		if node == num {
			return true
		}
	}
	return false
}
|
|
|
|
|
|
|
func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) { |
|
|
|
userImageUrl := "" |
|
|
|
userCommand := "" |
|
|
|
if engineId < 0 { |
|
|
|
tmpCodeObsPath := strings.Trim(req.CodeObsPath, "/") |
|
|
|
tmpCodeObsPaths := strings.Split(tmpCodeObsPath, "/") |
|
|
|
lastCodeDir := "code" |
|
|
|
if len(tmpCodeObsPaths) > 0 { |
|
|
|
lastCodeDir = tmpCodeObsPaths[len(tmpCodeObsPaths)-1] |
|
|
|
} |
|
|
|
userCommand = "/bin/bash /home/work/run_train.sh 's3://" + req.CodeObsPath + "' '" + lastCodeDir + "/" + req.BootFile + "' '/tmp/log/train.log' --'data_url'='s3://" + req.DataUrl + "' --'train_url'='s3://" + req.TrainUrl + "'" |
|
|
|
var versionInfos modelarts.VersionInfo |
|
|
|
if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { |
|
|
|
log.Info("json parse err." + err.Error()) |
|
|
|
} else { |
|
|
|
for _, engine := range versionInfos.Version { |
|
|
|
if engine.ID == engineId { |
|
|
|
userImageUrl = engine.Url |
|
|
|
break |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
for _, param := range req.Parameters { |
|
|
|
userCommand += " --'" + param.Label + "'='" + param.Value + "'" |
|
|
|
} |
|
|
|
return userCommand, userImageUrl |
|
|
|
} |
|
|
|
return userCommand, userImageUrl |
|
|
|
} |