diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 9e8447978..97791e25a 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -71,7 +71,8 @@ var ( FlavorInfos *models.FlavorInfos ImageInfos *models.ImageInfosModelArts TrainFlavorInfos *Flavor - SpecialPools *models.SpecialPools + SpecialPools *models.SpecialPools + MultiNodeConfig *MultiNodes ) type GenerateTrainJobReq struct { @@ -166,6 +167,14 @@ type ResourcePool struct { } `json:"resource_pool"` } +type MultiNodes struct{ + Info []OrgMultiNode `json:"multinode"` +} +type OrgMultiNode struct{ + Org string `json:"org"` + Node []int `json:"node"` +} + // type Parameter struct { // Label string `json:"label"` // Value string `json:"value"` @@ -773,6 +782,13 @@ func InitSpecialPool() { } } +func InitMultiNode(){ + if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{ + json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig) + } + +} + func HandleTrainJobInfo(task *models.Cloudbrain) error { result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 1e96ff9da..3b8a1d8cf 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -547,6 +547,7 @@ var ( FlavorInfos string TrainJobFLAVORINFOS string ModelArtsSpecialPools string + ModelArtsMultiNode string //grampus config Grampus = struct { @@ -1432,6 +1433,7 @@ func NewContext() { FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("") TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("") ModelArtsSpecialPools = sec.Key("SPECIAL_POOL").MustString("") + ModelArtsMultiNode=sec.Key("MULTI_NODE").MustString("") sec = Cfg.Section("elk") ElkUrl = sec.Key("ELKURL").MustString("") diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 5eac4cf2e..3453344f7 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1213,6 +1213,7 @@ modelarts.infer_job.select_model = Select Model modelarts.infer_job.boot_file_helper=The startup file is the entry file for your program execution and must end in.py.Such as inference.py, main.py, example/inference.py, case/main.py. modelarts.infer_job.tooltip = The model has been deleted and cannot be viewed. modelarts.download_log=Download log file +modelarts.no_node_right = The value of 'Amount of Compute Node' is wrong, you have no right to use the current value of 'Amount of Compute Node'. debug_task_not_created = Debug task has not been created diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 2fbd3ab52..d527218d3 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -1226,6 +1226,7 @@ modelarts.infer_job.select_model = 选择模型 modelarts.infer_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。比如inference.py、main.py、example/inference.py、case/main.py。 modelarts.infer_job.tooltip = 该模型已删除,无法查看。 modelarts.download_log=下载日志文件 +modelarts.no_node_right = 计算节点数的值配置错误,您没有权限使用当前配置的计算节点数。 debug_task_not_created = 未创建过调试任务 diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 847e831f6..e962db0a1 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -763,9 +763,23 @@ func trainJobNewDataPrepare(ctx *context.Context) error { waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount + setMultiNodeIfConfigureMatch(ctx) + return nil } +func setMultiNodeIfConfigureMatch(ctx *context.Context) { + modelarts.InitMultiNode() + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg { + ctx.Data["WorkNode"] = info.Node + break + } + } + } +} + func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) { modelarts.InitSpecialPool() @@ -880,6 +894,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts ctx.Data["datasetType"] = models.TypeCloudBrainTwo waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") ctx.Data["WaitCount"] = waitCount + setMultiNodeIfConfigureMatch(ctx) return nil } @@ -1115,6 +1130,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) VersionCount := modelarts.VersionCountOne EngineName := form.EngineName + errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1145,7 +1167,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) if errStr != "" { trainJobErrorNewDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) @@ -1349,6 +1371,48 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func checkMultiNode(userId int64, serverNum int) string{ + if serverNum==1{ + return "" + } + modelarts.InitMultiNode() + var isServerNumValid=false + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg { + if isInNodes(info.Node,serverNum){ + isServerNumValid=true + break + } + + } + } + } + if isServerNumValid{ + return "" + }else{ + return "repo.modelarts.no_node_right" + } +} +func checkInferenceJobMultiNode(userId int64, serverNum int) string{ + if serverNum==1{ + return "" + } + + return "repo.modelarts.no_node_right" + +} + +func isInNodes(nodes []int, num int) bool { + for _, node:=range nodes{ + if node==num{ + return true + } + } + return false + +} + func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) { userImageUrl := "" userCommand := "" @@ -1383,6 +1447,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") + errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + versionErrorDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1450,7 +1521,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) if errStr != "" { versionErrorDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) @@ -1718,11 +1789,7 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error { log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile)) return errors.New("启动文件必须是python文件") } - - if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 { - log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber) - return errors.New("计算节点数必须在1-2之间") - } + if form.BranchName == "" { log.Error("the branch must not be null!", form.BranchName) return errors.New("代码分支不能为空!") @@ -2021,6 +2088,13 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference ckptUrl := "/" + form.TrainUrl + form.CkptName log.Info("ckpt url:" + ckptUrl) + errStr:=checkInferenceJobMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + inferenceJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + return + } + count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -2069,7 +2143,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference } } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) if errStr != "" { inferenceJobErrorNewDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) diff --git a/templates/repo/modelarts/trainjob/new.tmpl b/templates/repo/modelarts/trainjob/new.tmpl index 7818938d3..cc1c0d7f1 100755 --- a/templates/repo/modelarts/trainjob/new.tmpl +++ b/templates/repo/modelarts/trainjob/new.tmpl @@ -287,8 +287,24 @@ id="trainjob_work_server_num" tabindex="3" autofocus required maxlength="255" value="1" readonly>