| @@ -13,6 +13,8 @@ import ( | |||||
| "strconv" | "strconv" | ||||
| "strings" | "strings" | ||||
| "code.gitea.io/gitea/modules/urfs_client/urchin" | |||||
| "code.gitea.io/gitea/modules/timeutil" | "code.gitea.io/gitea/modules/timeutil" | ||||
| "code.gitea.io/gitea/modules/notification" | "code.gitea.io/gitea/modules/notification" | ||||
| @@ -382,7 +384,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOpt | |||||
| //prepare command | //prepare command | ||||
| preTrainModelPath := getPreTrainModelPath(option.PreTrainModelUrl, option.CkptName) | preTrainModelPath := getPreTrainModelPath(option.PreTrainModelUrl, option.CkptName) | ||||
| command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, option.CkptName) | |||||
| command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, option.CkptName, "") | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) | log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) | ||||
| ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Create task failed, internal error")) | ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Create task failed, internal error")) | ||||
| @@ -611,7 +613,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOpt | |||||
| //prepare command | //prepare command | ||||
| preTrainModelPath := getPreTrainModelPath(option.PreTrainModelUrl, option.CkptName) | preTrainModelPath := getPreTrainModelPath(option.PreTrainModelUrl, option.CkptName) | ||||
| command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, option.CkptName) | |||||
| command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, option.CkptName, grampus.GetNpuModelRemoteObsUrl(jobName)) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) | log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) | ||||
| @@ -862,15 +864,18 @@ func getPreTrainModelPath(pretrainModelDir string, fileName string) string { | |||||
| } | } | ||||
| func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName string) (string, error) { | |||||
| func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) { | |||||
| var command string | var command string | ||||
| //prepare | |||||
| workDir := grampus.NpuWorkDir | workDir := grampus.NpuWorkDir | ||||
| if processorType == grampus.ProcessorTypeGPU { | |||||
| if processorType == grampus.ProcessorTypeNPU { | |||||
| command += "pwd;cd " + workDir + grampus.CommandPrepareScriptNpu | |||||
| } else if processorType == grampus.ProcessorTypeGPU { | |||||
| workDir = grampus.GpuWorkDir | workDir = grampus.GpuWorkDir | ||||
| command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScriptGpu, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject) | |||||
| } | } | ||||
| command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject) | |||||
| //download code & dataset | //download code & dataset | ||||
| if processorType == grampus.ProcessorTypeNPU { | if processorType == grampus.ProcessorTypeNPU { | ||||
| //no need to download code & dataset by internet | //no need to download code & dataset by internet | ||||
| @@ -885,7 +890,7 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo | |||||
| //no need to process | //no need to process | ||||
| } else if processorType == grampus.ProcessorTypeGPU { | } else if processorType == grampus.ProcessorTypeGPU { | ||||
| unZipDatasetCommand := generateDatasetUnzipCommand(datasetName) | unZipDatasetCommand := generateDatasetUnzipCommand(datasetName) | ||||
| commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand | |||||
| commandUnzip := "cd " + workDir + "code;unzip -q master.zip;rm -f master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand | |||||
| command += commandUnzip | command += commandUnzip | ||||
| } | } | ||||
| @@ -919,7 +924,8 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo | |||||
| var commandCode string | var commandCode string | ||||
| if processorType == grampus.ProcessorTypeNPU { | if processorType == grampus.ProcessorTypeNPU { | ||||
| commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py /tmp/log/train.log" + paramCode + ";" | |||||
| paramCode += " --model_url=" + modelRemoteObsUrl | |||||
| commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py " + grampus.NpuLocalLogUrl + paramCode + ";" | |||||
| } else if processorType == grampus.ProcessorTypeGPU { | } else if processorType == grampus.ProcessorTypeGPU { | ||||
| if pretrainModelFileName != "" { | if pretrainModelFileName != "" { | ||||
| paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName | paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName | ||||
| @@ -935,8 +941,7 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo | |||||
| //upload models | //upload models | ||||
| if processorType == grampus.ProcessorTypeNPU { | if processorType == grampus.ProcessorTypeNPU { | ||||
| commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_npu " + setting.Bucket + " " + outputRemotePath + " " + workDir + "output/;" | |||||
| command += commandUpload | |||||
| // no need to upload | |||||
| } else if processorType == grampus.ProcessorTypeGPU { | } else if processorType == grampus.ProcessorTypeGPU { | ||||
| commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;" | commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;" | ||||
| command += commandUpload | command += commandUpload | ||||
| @@ -948,7 +953,6 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo | |||||
| return command, nil | return command, nil | ||||
| } | } | ||||
| func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string { | func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string { | ||||
| commandDownloadTemp := commandDownload | commandDownloadTemp := commandDownload | ||||
| if pretrainModelPath != "" { | if pretrainModelPath != "" { | ||||
| @@ -1083,6 +1087,11 @@ func SyncTaskStatus(task *models.Cloudbrain) error { | |||||
| task.CorrectCreateUnix() | task.CorrectCreateUnix() | ||||
| if oldStatus != task.Status { | if oldStatus != task.Status { | ||||
| notification.NotifyChangeCloudbrainStatus(task, oldStatus) | notification.NotifyChangeCloudbrainStatus(task, oldStatus) | ||||
| if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource { | |||||
| if len(result.JobInfo.Tasks[0].CenterID) == 1 { | |||||
| urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID)) | |||||
| } | |||||
| } | |||||
| } | } | ||||
| err = models.UpdateJob(task) | err = models.UpdateJob(task) | ||||
| if err != nil { | if err != nil { | ||||