diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 30f080335..25c556278 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -20,7 +20,7 @@ import ( const ( //Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` - CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` + CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh | tee /model/benchmark-log.txt;echo "end benchmark"` CodeMountPath = "/code" DataSetMountPath = "/dataset" ModelMountPath = "/model" @@ -30,8 +30,8 @@ const ( Snn4imagenetMountPath = "/snn4imagenet" BrainScoreMountPath = "/brainscore" TaskInfoName = "/taskInfo" - Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'` - BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'` + Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s' | tee /model/benchmark-log.txt` + BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s' | tee /model/benchmark-log.txt` SubTaskName = "task1" diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index dcdf9273c..7cf0861fd 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -70,7 +70,8 @@ const ( var ( poolInfos *models.PoolInfos TrainFlavorInfos *Flavor - SpecialPools *models.SpecialPools + SpecialPools *models.SpecialPools + MultiNodeConfig *MultiNodes ) type GenerateTrainJobReq struct { @@ -165,6 +166,14 @@ type ResourcePool struct { } `json:"resource_pool"` } +type MultiNodes struct{ + Info []OrgMultiNode `json:"multinode"` +} +type OrgMultiNode struct{ + Org string `json:"org"` + Node []int `json:"node"` +} + // type Parameter struct { // Label string `json:"label"` // Value string `json:"value"` @@ -768,6 +777,13 @@ func InitSpecialPool() { } } +func InitMultiNode(){ + if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{ + json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig) + } + +} + func HandleTrainJobInfo(task *models.Cloudbrain) error { result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 0bc47aa70..3853c23f5 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -568,6 +568,7 @@ var ( EngineVersions string TrainJobFLAVORINFOS string ModelArtsSpecialPools string + ModelArtsMultiNode string // modelarts-cd config ModelartsCD = struct { @@ -1463,6 +1464,7 @@ func NewContext() { FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("") TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("") ModelArtsSpecialPools = sec.Key("SPECIAL_POOL").MustString("") + ModelArtsMultiNode=sec.Key("MULTI_NODE").MustString("") sec = Cfg.Section("elk") ElkUrl = sec.Key("ELKURL").MustString("") diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index adb5eebf5..f6ff5d96f 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1214,6 +1214,7 @@ modelarts.infer_job.select_model = Select Model modelarts.infer_job.boot_file_helper=The startup file is the entry file for your program execution and must end in.py.Such as inference.py, main.py, example/inference.py, case/main.py. modelarts.infer_job.tooltip = The model has been deleted and cannot be viewed. modelarts.download_log=Download log file +modelarts.no_node_right = The value of 'Amount of Compute Node' is wrong, you have no right to use the current value of 'Amount of Compute Node'. debug_task_not_created = Debug task has not been created diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index c8daeb1be..17ec29319 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -1227,6 +1227,7 @@ modelarts.infer_job.select_model = 选择模型 modelarts.infer_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。比如inference.py、main.py、example/inference.py、case/main.py。 modelarts.infer_job.tooltip = 该模型已删除,无法查看。 modelarts.download_log=下载日志文件 +modelarts.no_node_right = 计算节点数的值配置错误,您没有权限使用当前配置的计算节点数。 debug_task_not_created = 未创建过调试任务 diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index b450b2e26..d6b7bb076 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -405,52 +405,159 @@ func CloudbrainDownloadLogFile(ctx *context.Context) { func CloudbrainGetLog(ctx *context.Context) { ID := ctx.Params(":id") - startLine := ctx.QueryInt("base_line") - lines := ctx.QueryInt("lines") - endLine := startLine + lines - order := ctx.Query("order") - if order == "asc" { - endLine = startLine - startLine = endLine - lines - if startLine < 0 { - startLine = 0 - } - } job, err := models.GetCloudbrainByID(ID) if err != nil { log.Error("GetCloudbrainByJobName failed: %v", err, ctx.Data["MsgID"]) ctx.ServerError(err.Error(), err) return } - result := getLogFromModelDir(job.JobName, startLine, endLine) - if result == nil { - log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) - ctx.ServerError(err.Error(), err) - return + lines := ctx.QueryInt("lines") + baseLine := ctx.Query("base_line") + order := ctx.Query("order") + var result map[string]interface{} + resultPath := "/model" + if job.JobType == string(models.JobTypeInference) { + resultPath = "/result" + } + if baseLine == "" && order == "desc" { + result = getLastLogFromModelDir(job.JobName, lines, resultPath) + } else { + startLine := ctx.QueryInt("base_line") + endLine := startLine + lines + if order == "asc" { + if baseLine == "" { + startLine = 0 + endLine = lines + } else { + endLine = startLine + startLine = endLine - lines + if startLine < 0 { + startLine = 0 + } + } + } + result = getLogFromModelDir(job.JobName, startLine, endLine, resultPath) + if result == nil { + log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) + ctx.ServerError(err.Error(), err) + return + } } - re := map[string]interface{}{ "JobID": ID, "LogFileName": result["FileName"], - "StartLine": startLine, - "EndLine": result["endLine"], + "StartLine": result["StartLine"], + "EndLine": result["EndLine"], "Content": result["Content"], - "Lines": result["lines"], + "Lines": result["Lines"], "CanLogDownload": result["FileName"] != "", } //result := CloudbrainGetLogByJobId(job.JobID, job.JobName) - ctx.JSON(http.StatusOK, re) } -func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]interface{} { - prefix := "/" + setting.CBCodePathPrefix + jobName + "/model" +func getAllLineFromFile(path string) int { + count := 0 + reader, err := os.Open(path) + defer reader.Close() + if err == nil { + r := bufio.NewReader(reader) + for { + _, error := r.ReadString('\n') + if error == io.EOF { + log.Info("read file completed.") + break + } + if error != nil { + log.Info("read file error." + error.Error()) + break + } + count = count + 1 + } + } else { + log.Info("error:" + err.Error()) + } + return count +} + +func getLastLogFromModelDir(jobName string, lines int, resultPath string) map[string]interface{} { + prefix := "/" + setting.CBCodePathPrefix + jobName + resultPath files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "") if err != nil { log.Error("query cloudbrain model failed: %v", err) return nil } + re := "" + fileName := "" + count := 0 + allLines := 0 + startLine := 0 + for _, file := range files { + if strings.HasSuffix(file.FileName, "log.txt") { + fileName = file.FileName + path := storage.GetMinioPath(jobName+resultPath+"/", file.FileName) + allLines = getAllLineFromFile(path) + startLine = allLines - lines + if startLine < 0 { + startLine = 0 + } + count = allLines - startLine + log.Info("path=" + path) + reader, err := os.Open(path) + defer reader.Close() + if err == nil { + r := bufio.NewReader(reader) + for i := 0; i < allLines; i++ { + line, error := r.ReadString('\n') + if error == io.EOF { + log.Info("read file completed.") + break + } + if error != nil { + log.Info("read file error." + error.Error()) + break + } + if error == nil { + if i >= startLine { + re = re + line + } + } + } + } else { + log.Info("error:" + err.Error()) + } + break + } + } + + return map[string]interface{}{ + "JobName": jobName, + "Content": re, + "FileName": fileName, + "Lines": count, + "EndLine": allLines, + "StartLine": startLine, + } +} + +func getLogFromModelDir(jobName string, startLine int, endLine int, resultPath string) map[string]interface{} { + prefix := "/" + setting.CBCodePathPrefix + jobName + resultPath + files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "") + if err != nil { + log.Error("query cloudbrain model failed: %v", err) + return nil + } + if startLine == endLine { + return map[string]interface{}{ + "JobName": jobName, + "Content": "", + "FileName": "", + "Lines": 0, + "EndLine": startLine, + "StartLine": startLine, + } + } re := "" fileName := "" count := 0 @@ -458,7 +565,7 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i for _, file := range files { if strings.HasSuffix(file.FileName, "log.txt") { fileName = file.FileName - path := storage.GetMinioPath(jobName+"/model/", file.FileName) + path := storage.GetMinioPath(jobName+resultPath+"/", file.FileName) log.Info("path=" + path) reader, err := os.Open(path) defer reader.Close() @@ -467,7 +574,6 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i for i := 0; i < endLine; i++ { line, error := r.ReadString('\n') log.Info("line=" + line) - fileEndLine = i if error == io.EOF { log.Info("read file completed.") break @@ -478,11 +584,13 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i } if error == nil { if i >= startLine { + fileEndLine = i re = re + line count++ } } } + fileEndLine = fileEndLine + 1 } else { log.Info("error:" + err.Error()) } @@ -491,11 +599,12 @@ func getLogFromModelDir(jobName string, startLine int, endLine int) map[string]i } return map[string]interface{}{ - "JobName": jobName, - "Content": re, - "FileName": fileName, - "lines": count, - "endLine": fileEndLine, + "JobName": jobName, + "Content": re, + "FileName": fileName, + "Lines": count, + "EndLine": fileEndLine, + "StartLine": startLine, } } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 2e35ac3cc..b8e2640af 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -2431,7 +2431,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, &form) return } - + log.Info("Command=" + command) + log.Info("ModelPath=" + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")) req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -2560,7 +2561,8 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) return } - + log.Info("Command=" + command) + log.Info("ModelPath=" + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")) req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -2689,7 +2691,7 @@ func getInferenceJobCommand(form auth.CreateCloudBrainInferencForm) (string, err param += " --modelname" + "=" + form.CkptName - command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile + command += "python /code/" + bootFile + param + " | tee " + cloudbrain.ResultPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile return command, nil } @@ -2718,7 +2720,7 @@ func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { } } - command += "python /code/" + bootFile + param + " | tee " + cloudbrain.ModelMountPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile + command += "python /code/" + bootFile + param + " > " + cloudbrain.ModelMountPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile return command, nil } diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index bff00f0c5..83a929f07 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -712,9 +712,23 @@ func trainJobNewDataPrepare(ctx *context.Context) error { waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount + setMultiNodeIfConfigureMatch(ctx) + return nil } +func setMultiNodeIfConfigureMatch(ctx *context.Context) { + modelarts.InitMultiNode() + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg { + ctx.Data["WorkNode"] = info.Node + break + } + } + } +} + func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) { modelarts.InitSpecialPool() @@ -829,6 +843,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts ctx.Data["datasetType"] = models.TypeCloudBrainTwo waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount + setMultiNodeIfConfigureMatch(ctx) return nil } @@ -1064,6 +1079,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) VersionCount := modelarts.VersionCountOne EngineName := form.EngineName + errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1094,7 +1116,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) if errStr != "" { trainJobErrorNewDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) @@ -1298,6 +1320,48 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func checkMultiNode(userId int64, serverNum int) string{ + if serverNum==1{ + return "" + } + modelarts.InitMultiNode() + var isServerNumValid=false + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg { + if isInNodes(info.Node,serverNum){ + isServerNumValid=true + break + } + + } + } + } + if isServerNumValid{ + return "" + }else{ + return "repo.modelarts.no_node_right" + } +} +func checkInferenceJobMultiNode(userId int64, serverNum int) string{ + if serverNum==1{ + return "" + } + + return "repo.modelarts.no_node_right" + +} + +func isInNodes(nodes []int, num int) bool { + for _, node:=range nodes{ + if node==num{ + return true + } + } + return false + +} + func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) { userImageUrl := "" userCommand := "" @@ -1332,6 +1396,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") + errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + versionErrorDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1399,7 +1470,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) if errStr != "" { versionErrorDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) @@ -1667,11 +1738,7 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error { log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile)) return errors.New("启动文件必须是python文件") } - - if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 { - log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber) - return errors.New("计算节点数必须在1-2之间") - } + if form.BranchName == "" { log.Error("the branch must not be null!", form.BranchName) return errors.New("代码分支不能为空!") @@ -1970,6 +2037,13 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference ckptUrl := "/" + form.TrainUrl + form.CkptName log.Info("ckpt url:" + ckptUrl) + errStr:=checkInferenceJobMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + inferenceJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + return + } + count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -2018,7 +2092,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference } } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) if errStr != "" { inferenceJobErrorNewDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) diff --git a/templates/repo/cloudbrain/benchmark/show.tmpl b/templates/repo/cloudbrain/benchmark/show.tmpl index add7d34d4..d657ac2ff 100755 --- a/templates/repo/cloudbrain/benchmark/show.tmpl +++ b/templates/repo/cloudbrain/benchmark/show.tmpl @@ -256,8 +256,9 @@
+ + + + diff --git a/templates/repo/cloudbrain/inference/show.tmpl b/templates/repo/cloudbrain/inference/show.tmpl index 3ec01417e..6d0cee642 100644 --- a/templates/repo/cloudbrain/inference/show.tmpl +++ b/templates/repo/cloudbrain/inference/show.tmpl @@ -228,7 +228,7 @@ {{with .task}}