Browse Source

Merge pull request 'issue 2840' (#2842) from V20220830.patch into V20220908

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/2842
Reviewed-by: lewis <747342561@qq.com>
tags/v1.22.9.1^2
lewis 3 years ago
parent
commit
a1c5dfa246
6 changed files with 120 additions and 10 deletions
  1. +17
    -1
      modules/modelarts/modelarts.go
  2. +2
    -0
      modules/setting/setting.go
  3. +1
    -0
      options/locale/locale_en-US.ini
  4. +1
    -0
      options/locale/locale_zh-CN.ini
  5. +82
    -8
      routers/repo/modelarts.go
  6. +17
    -1
      templates/repo/modelarts/trainjob/new.tmpl

+ 17
- 1
modules/modelarts/modelarts.go View File

@@ -71,7 +71,8 @@ var (
FlavorInfos *models.FlavorInfos
ImageInfos *models.ImageInfosModelArts
TrainFlavorInfos *Flavor
SpecialPools *models.SpecialPools
SpecialPools *models.SpecialPools
MultiNodeConfig *MultiNodes
)

type GenerateTrainJobReq struct {
@@ -166,6 +167,14 @@ type ResourcePool struct {
} `json:"resource_pool"`
}

type MultiNodes struct{
Info []OrgMultiNode `json:"multinode"`
}
type OrgMultiNode struct{
Org string `json:"org"`
Node []int `json:"node"`
}

// type Parameter struct {
// Label string `json:"label"`
// Value string `json:"value"`
@@ -773,6 +782,13 @@ func InitSpecialPool() {
}
}

func InitMultiNode(){
if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{
json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
}

}

func HandleTrainJobInfo(task *models.Cloudbrain) error {

result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))


+ 2
- 0
modules/setting/setting.go View File

@@ -547,6 +547,7 @@ var (
FlavorInfos string
TrainJobFLAVORINFOS string
ModelArtsSpecialPools string
ModelArtsMultiNode string

//grampus config
Grampus = struct {
@@ -1432,6 +1433,7 @@ func NewContext() {
FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("")
TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("")
ModelArtsSpecialPools = sec.Key("SPECIAL_POOL").MustString("")
ModelArtsMultiNode=sec.Key("MULTI_NODE").MustString("")

sec = Cfg.Section("elk")
ElkUrl = sec.Key("ELKURL").MustString("")


+ 1
- 0
options/locale/locale_en-US.ini View File

@@ -1213,6 +1213,7 @@ modelarts.infer_job.select_model = Select Model
modelarts.infer_job.boot_file_helper=The startup file is the entry file for your program execution and must end in.py.Such as inference.py, main.py, example/inference.py, case/main.py.
modelarts.infer_job.tooltip = The model has been deleted and cannot be viewed.
modelarts.download_log=Download log file
modelarts.no_node_right = The value of 'Amount of Compute Node' is wrong, you have no right to use the current value of 'Amount of Compute Node'.


debug_task_not_created = Debug task has not been created


+ 1
- 0
options/locale/locale_zh-CN.ini View File

@@ -1226,6 +1226,7 @@ modelarts.infer_job.select_model = 选择模型
modelarts.infer_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。比如inference.py、main.py、example/inference.py、case/main.py。
modelarts.infer_job.tooltip = 该模型已删除,无法查看。
modelarts.download_log=下载日志文件
modelarts.no_node_right = 计算节点数的值配置错误,您没有权限使用当前配置的计算节点数。


debug_task_not_created = 未创建过调试任务


+ 82
- 8
routers/repo/modelarts.go View File

@@ -763,9 +763,23 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
ctx.Data["WaitCount"] = waitCount

setMultiNodeIfConfigureMatch(ctx)
return nil
}

func setMultiNodeIfConfigureMatch(ctx *context.Context) {
modelarts.InitMultiNode()
if modelarts.MultiNodeConfig != nil {
for _, info := range modelarts.MultiNodeConfig.Info {
if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg {
ctx.Data["WorkNode"] = info.Node
break
}
}
}
}

func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) {
modelarts.InitSpecialPool()

@@ -880,6 +894,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts
ctx.Data["datasetType"] = models.TypeCloudBrainTwo
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
ctx.Data["WaitCount"] = waitCount
setMultiNodeIfConfigureMatch(ctx)

return nil
}
@@ -1115,6 +1130,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
VersionCount := modelarts.VersionCountOne
EngineName := form.EngineName

errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber)
if errStr!=""{
trainJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form)
return
}

count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
if err != nil {
log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
@@ -1145,7 +1167,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
return
}

errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
if errStr != "" {
trainJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form)
@@ -1349,6 +1371,48 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

func checkMultiNode(userId int64, serverNum int) string{
if serverNum==1{
return ""
}
modelarts.InitMultiNode()
var isServerNumValid=false
if modelarts.MultiNodeConfig != nil {
for _, info := range modelarts.MultiNodeConfig.Info {
if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg {
if isInNodes(info.Node,serverNum){
isServerNumValid=true
break
}

}
}
}
if isServerNumValid{
return ""
}else{
return "repo.modelarts.no_node_right"
}
}
func checkInferenceJobMultiNode(userId int64, serverNum int) string{
if serverNum==1{
return ""
}

return "repo.modelarts.no_node_right"

}

func isInNodes(nodes []int, num int) bool {
for _, node:=range nodes{
if node==num{
return true
}
}
return false

}

func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) {
userImageUrl := ""
userCommand := ""
@@ -1383,6 +1447,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
ctx.Data["PageIsTrainJob"] = true
var jobID = ctx.Params(":jobid")

errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber)
if errStr!=""{
versionErrorDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form)
return
}

count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
if err != nil {
log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
@@ -1450,7 +1521,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
return
}

errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
if errStr != "" {
versionErrorDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form)
@@ -1718,11 +1789,7 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile))
return errors.New("启动文件必须是python文件")
}

if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 {
log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber)
return errors.New("计算节点数必须在1-2之间")
}
if form.BranchName == "" {
log.Error("the branch must not be null!", form.BranchName)
return errors.New("代码分支不能为空!")
@@ -2021,6 +2088,13 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
ckptUrl := "/" + form.TrainUrl + form.CkptName
log.Info("ckpt url:" + ckptUrl)

errStr:=checkInferenceJobMultiNode(ctx.User.ID,form.WorkServerNumber)
if errStr!=""{
inferenceJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form)
return
}

count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID)
if err != nil {
log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
@@ -2069,7 +2143,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
}
}

errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference))
errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference))
if errStr != "" {
inferenceJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form)


+ 17
- 1
templates/repo/modelarts/trainjob/new.tmpl View File

@@ -287,8 +287,24 @@
id="trainjob_work_server_num" tabindex="3" autofocus required maxlength="255" value="1"
readonly>
<div class="field" id="trainjob_work_server_num_select" name="work_server_number_select">
<select class="ui dropdown width" style='width: 100%;' name="work_server_id">
<select class="ui dropdown width" style='width: 100%;' name="work_server_id">
{{if .WorkNode}}
{{range .WorkNode}}

{{if $.work_server_number}}
{{if eq . $.work_server_number }}
<option name="server_id" selected value="{{.}}">{{.}}</option>
{{else}}
<option name="server_id" value="{{.}}">{{.}}</option>
{{end}}
{{else}}
<option name="server_id" value="{{.}}">{{.}}</option>
{{end}}
{{end}}

{{else}}
<option name="server_id" value="1">1</option>
{{end}}
</select>
</div>



Loading…
Cancel
Save