// SpecialPool is one entry of the special (dedicated) resource pool
// configuration, decoded from JSON.
type SpecialPool struct {
	// Org is the name of the organization the pool belongs to; callers look
	// the organization up by this name and check the user's membership.
	Org string `json:"org"`
	// Type is a free-form pool type label.
	// NOTE(review): its semantics are not visible here — confirm with the config docs.
	Type string `json:"type"`
	// IsExclusive is decoded from "isExclusive".
	// NOTE(review): presumably marks the pool as reserved for Org only — verify against callers.
	IsExclusive bool `json:"isExclusive"`
	// Pool lists the GPU queue entries that make up this pool; requests are
	// matched against each entry's Queue.
	Pool []*GpuInfo `json:"pool"`
	// JobType lists the job types this pool applies to.
	JobType []string `json:"jobType"`
	// ResourceSpec optionally lists the resource specs offered by this pool
	// (JSON key "resourceSpecs"). It may be nil when the pool defines none.
	ResourceSpec []*ResourceSpec `json:"resourceSpecs"`
}
req.ResourceSpecId, req.Ctx.Data["MsgID"]) @@ -538,3 +559,39 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e return nil } + +func InitSpecialPool() { + if SpecialPools == nil && setting.SpecialPools != "" { + json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools) + } +} + +func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool { + if resourceSpecs == nil || len(resourceSpecs) == 0 { + return true + } + for _, v := range resourceSpecs { + if v.Id == resourceSpecId { + return true + } + } + return false +} + +func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool { + for _, v := range pool { + if v.Queue == queue { + return true + } + } + return false +} + +func IsElementExist(s []string, str string) bool { + for _, v := range s { + if v == str { + return true + } + } + return false +} diff --git a/modules/setting/setting.go b/modules/setting/setting.go index bcc59bc53..d459d01d4 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -460,6 +460,7 @@ var ( CBCodePathPrefix string JobType string GpuTypes string + SpecialPools string DebugServerHost string ResourceSpecs string MaxDuration int64 @@ -1331,6 +1332,8 @@ func NewContext() { TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") MaxModelSize = sec.Key("MAX_MODEL_SIZE").MustFloat64(500) + SpecialPools = sec.Key("SPECIAL_POOL").MustString("") + MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) sec = Cfg.Section("benchmark") diff --git a/routers/api/v1/repo/cloudbrain_dashboard.go b/routers/api/v1/repo/cloudbrain_dashboard.go index cc125c97f..eb86a8293 100755 --- a/routers/api/v1/repo/cloudbrain_dashboard.go +++ b/routers/api/v1/repo/cloudbrain_dashboard.go @@ -752,10 +752,26 @@ func GetCloudbrainsDetailData(ctx *context.Context) { taskDetail.RepoAlias = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Alias } if 
ciTasks[i].Cloudbrain.Status == string(models.JobWaiting) { - WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() - taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) - if WaitTimeInt < 0 { - taskDetail.WaitTime = "00:00:00" + if ciTasks[i].Cloudbrain.DeletedAt != nilTime { + WaitTimeInt := ciTasks[i].Cloudbrain.UpdatedUnix.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() + taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) + if WaitTimeInt < 0 { + taskDetail.WaitTime = "00:00:00" + } + } else { + if ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 { + WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() + taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) + if WaitTimeInt < 0 { + taskDetail.WaitTime = "00:00:00" + } + } else { + WaitTimeInt := ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() + taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) + if WaitTimeInt < 0 { + taskDetail.WaitTime = "00:00:00" + } + } } } else if ciTasks[i].Cloudbrain.Status == string(models.JobStopped) && ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 { WaitTimeInt := ciTasks[i].Cloudbrain.EndTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 5a3d0a6f8..cf864001a 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -2,7 +2,6 @@ package repo import ( "bufio" - "code.gitea.io/gitea/modules/grampus" "encoding/json" "errors" "fmt" @@ -16,6 +15,8 @@ import ( "time" "unicode/utf8" + "code.gitea.io/gitea/modules/grampus" + "code.gitea.io/gitea/modules/timeutil" "github.com/unknwon/i18n" @@ -149,6 +150,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType + cloudbrain.InitSpecialPool() + if gpuInfos == nil { 
json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) } @@ -178,6 +181,45 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) } ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec + + if cloudbrain.SpecialPools != nil { + var debugGpuTypes []*models.GpuInfo + var trainGpuTypes []*models.GpuInfo + + for _, pool := range cloudbrain.SpecialPools.Pools { + org, _ := models.GetOrgByName(pool.Org) + if org != nil { + isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID) + if isOrgMember { + for _, jobType := range pool.JobType { + if jobType == string(models.JobTypeDebug) { + debugGpuTypes = append(debugGpuTypes, pool.Pool...) + if pool.ResourceSpec != nil { + ctx.Data["resource_specs"] = pool.ResourceSpec + } + } else if jobType == string(models.JobTypeTrain) { + trainGpuTypes = append(trainGpuTypes, pool.Pool...) + if pool.ResourceSpec != nil { + ctx.Data["train_resource_specs"] = pool.ResourceSpec + } + } + } + break + } + } + + } + + if len(debugGpuTypes) > 0 { + ctx.Data["gpu_types"] = debugGpuTypes + } + + if len(trainGpuTypes) > 0 { + ctx.Data["train_gpu_types"] = trainGpuTypes + } + + } + ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName @@ -217,6 +259,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { repo := ctx.Repo.Repository tpl := tplCloudBrainNew + if jobType == string(models.JobTypeTrain) { + tpl = tplCloudBrainTrainJobNew + } + tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) if err == nil { if len(tasks) != 0 { @@ -282,6 +328,14 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { command = commandTrain } + errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId) + + if errStr != "" { + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr(errStr, tpl, &form) + return + } + if branchName == "" 
{ branchName = cloudbrain.DefaultBranchName } @@ -334,6 +388,42 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { } } +/** + 检查用户传输的参数是否符合专属资源池 +*/ +func checkCloudBrainSpecialPool(ctx *context.Context, jobType string, queue string, resourceSpecId int) string { + if cloudbrain.SpecialPools != nil { + + var isInPoolOrg = false + var matchSpecialPool = false + + for _, specialPool := range cloudbrain.SpecialPools.Pools { + + if cloudbrain.IsElementExist(specialPool.JobType, jobType) && cloudbrain.IsQueueInSpecialtPool(specialPool.Pool, queue) { + if cloudbrain.IsResourceSpecInSpecialPool(specialPool.ResourceSpec, resourceSpecId) { + matchSpecialPool = true + org, _ := models.GetOrgByName(specialPool.Org) + if org != nil { + isInPoolOrg, _ = models.IsOrganizationMember(org.ID, ctx.User.ID) + if isInPoolOrg { + break //传入参数,和专属资源池匹配上了,检查通过 + } + } + } + + } + + } + //资源池有匹配上,但是用户不在相应的组织中,返回错误信息。界面已经过滤了选择,界面操作不会到这个逻辑 + if matchSpecialPool && !isInPoolOrg { + return ctx.Tr("repo.grampus.no_operate_right") + } + + } + //没有匹配到资源池或者没有设置专属资源池,检查通过; 获取和资源池完全匹配检查通过 + return "" +} + func CloudBrainRestart(ctx *context.Context) { var ID = ctx.Params(":id") var resultCode = "0" @@ -573,7 +663,9 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo if task.TrainJobDuration == "" { if task.Duration == 0 { var duration int64 - if task.Status == string(models.JobRunning) { + if task.Status == string(models.JobWaiting) { + duration = 0 + } else if task.Status == string(models.JobRunning) { duration = time.Now().Unix() - int64(task.CreatedUnix) } else { duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix) diff --git a/templates/repo/modelarts/trainjob/version_new.tmpl b/templates/repo/modelarts/trainjob/version_new.tmpl index b7fcd36ad..886469d4c 100644 --- a/templates/repo/modelarts/trainjob/version_new.tmpl +++ b/templates/repo/modelarts/trainjob/version_new.tmpl @@ -446,24 +446,6 @@ ] }, - work_server_number: { - 
identifier : 'work_server_number', - rules: [ - { - type : 'integer[1..25]', - prompt : '计算节点需要在1-25之间,请您键入正确的值' - } - ] - }, - run_para_list:{ - identifier : 'run_para_list', - rules: [ - { - type: 'maxLength[255]', - prompt : '所有字符最长不超过255个字符。' - } - ] - }, }, }) @@ -512,24 +494,6 @@ ] }, - work_server_number: { - identifier : 'work_server_number', - rules: [ - { - type : 'integer[1..25]', - prompt : '计算节点需要在1-25之间,请您键入正确的值' - } - ] - }, - run_para_list:{ - identifier : 'run_para_list', - rules: [ - { - type: 'maxLength[255]', - prompt : '所有字符最长不超过255个字符。' - } - ] - }, }, onSuccess: function(){ // $('.ui.page.dimmer').dimmer('show')