| @@ -43,9 +43,10 @@ const ( | |||||
| ) | ) | ||||
| var ( | var ( | ||||
| ResourceSpecs *models.ResourceSpecs | |||||
| TrainResourceSpecs *models.ResourceSpecs | |||||
| SpecialPools *models.SpecialPools | |||||
| ResourceSpecs *models.ResourceSpecs | |||||
| TrainResourceSpecs *models.ResourceSpecs | |||||
| InferenceResourceSpecs *models.ResourceSpecs | |||||
| SpecialPools *models.SpecialPools | |||||
| ) | ) | ||||
| type GenerateCloudBrainTaskReq struct { | type GenerateCloudBrainTaskReq struct { | ||||
| @@ -222,7 +223,7 @@ func AdminOrImageCreaterRight(ctx *context.Context) { | |||||
| func GenerateTask(req GenerateCloudBrainTaskReq) error { | func GenerateTask(req GenerateCloudBrainTaskReq) error { | ||||
| var resourceSpec *models.ResourceSpec | var resourceSpec *models.ResourceSpec | ||||
| var versionCount int | var versionCount int | ||||
| if req.JobType == string(models.JobTypeTrain) || req.JobType == string(models.JobTypeInference) { | |||||
| if req.JobType == string(models.JobTypeTrain) { | |||||
| versionCount = 1 | versionCount = 1 | ||||
| if TrainResourceSpecs == nil { | if TrainResourceSpecs == nil { | ||||
| json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) | json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) | ||||
| @@ -233,6 +234,17 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||||
| break | break | ||||
| } | } | ||||
| } | } | ||||
| } else if req.JobType == string(models.JobTypeInference) { | |||||
| if InferenceResourceSpecs == nil { | |||||
| json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs) | |||||
| } | |||||
| for _, spec := range InferenceResourceSpecs.ResourceSpec { | |||||
| if req.ResourceSpecId == spec.Id { | |||||
| resourceSpec = spec | |||||
| break | |||||
| } | |||||
| } | |||||
| } else { | } else { | ||||
| if ResourceSpecs == nil { | if ResourceSpecs == nil { | ||||
| json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | ||||
| @@ -247,21 +259,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||||
| } | } | ||||
| //如果没有匹配到spec信息,尝试从专属资源池获取 | //如果没有匹配到spec信息,尝试从专属资源池获取 | ||||
| if resourceSpec == nil && SpecialPools != nil { | if resourceSpec == nil && SpecialPools != nil { | ||||
| for _, specialPool := range SpecialPools.Pools { | |||||
| if resourceSpec != nil { | |||||
| break | |||||
| } | |||||
| if specialPool.ResourceSpec != nil { | |||||
| if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) { | |||||
| for _, spec := range specialPool.ResourceSpec { | |||||
| if req.ResourceSpecId == spec.Id { | |||||
| resourceSpec = spec | |||||
| break | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId) | |||||
| } | } | ||||
| if resourceSpec == nil { | if resourceSpec == nil { | ||||
| @@ -452,6 +450,11 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||||
| } | } | ||||
| } | } | ||||
| //如果没有匹配到spec信息,尝试从专属资源池获取 | |||||
| if resourceSpec == nil && SpecialPools != nil { | |||||
| resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) | |||||
| } | |||||
| if resourceSpec == nil { | if resourceSpec == nil { | ||||
| log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | ||||
| return errors.New("no such resourceSpec") | return errors.New("no such resourceSpec") | ||||
| @@ -590,6 +593,23 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||||
| return nil | return nil | ||||
| } | } | ||||
| func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec { | |||||
| for _, specialPool := range SpecialPools.Pools { | |||||
| if specialPool.ResourceSpec != nil { | |||||
| if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) { | |||||
| for _, spec := range specialPool.ResourceSpec { | |||||
| if resourceSpecId == spec.Id { | |||||
| return spec | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func DelCloudBrainJob(jobId string) string { | func DelCloudBrainJob(jobId string) string { | ||||
| task, err := models.GetCloudbrainByJobID(jobId) | task, err := models.GetCloudbrainByJobID(jobId) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -453,20 +453,22 @@ var ( | |||||
| DecompressOBSTaskName string | DecompressOBSTaskName string | ||||
| //cloudbrain config | //cloudbrain config | ||||
| CBAuthUser string | |||||
| CBAuthPassword string | |||||
| RestServerHost string | |||||
| JobPath string | |||||
| CBCodePathPrefix string | |||||
| JobType string | |||||
| GpuTypes string | |||||
| SpecialPools string | |||||
| DebugServerHost string | |||||
| ResourceSpecs string | |||||
| MaxDuration int64 | |||||
| TrainGpuTypes string | |||||
| TrainResourceSpecs string | |||||
| MaxDatasetNum int | |||||
| CBAuthUser string | |||||
| CBAuthPassword string | |||||
| RestServerHost string | |||||
| JobPath string | |||||
| CBCodePathPrefix string | |||||
| JobType string | |||||
| GpuTypes string | |||||
| SpecialPools string | |||||
| DebugServerHost string | |||||
| ResourceSpecs string | |||||
| MaxDuration int64 | |||||
| TrainGpuTypes string | |||||
| TrainResourceSpecs string | |||||
| InferenceGpuTypes string | |||||
| InferenceResourceSpecs string | |||||
| MaxDatasetNum int | |||||
| //benchmark config | //benchmark config | ||||
| IsBenchmarkEnabled bool | IsBenchmarkEnabled bool | ||||
| @@ -1312,6 +1314,8 @@ func NewContext() { | |||||
| MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) | MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) | ||||
| TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") | TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") | ||||
| TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") | TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") | ||||
| InferenceGpuTypes = sec.Key("INFERENCE_GPU_TYPES").MustString("") | |||||
| InferenceResourceSpecs = sec.Key("INFERENCE_RESOURCE_SPECS").MustString("") | |||||
| SpecialPools = sec.Key("SPECIAL_POOL").MustString("") | SpecialPools = sec.Key("SPECIAL_POOL").MustString("") | ||||
| MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) | MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) | ||||
| @@ -59,6 +59,7 @@ var ( | |||||
| benchmarkGpuInfos *models.GpuInfos | benchmarkGpuInfos *models.GpuInfos | ||||
| benchmarkResourceSpecs *models.ResourceSpecs | benchmarkResourceSpecs *models.ResourceSpecs | ||||
| trainGpuInfos *models.GpuInfos | trainGpuInfos *models.GpuInfos | ||||
| inferenceGpuInfos *models.GpuInfos | |||||
| ) | ) | ||||
| const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" | const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" | ||||
| @@ -130,6 +131,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||||
| } | } | ||||
| ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo | ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo | ||||
| if inferenceGpuInfos == nil { | |||||
| json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||||
| } | |||||
| ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo | |||||
| if benchmarkGpuInfos == nil { | if benchmarkGpuInfos == nil { | ||||
| json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | ||||
| } | } | ||||
| @@ -150,6 +156,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||||
| } | } | ||||
| ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | ||||
| if cloudbrain.InferenceResourceSpecs == nil { | |||||
| json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||||
| } | |||||
| ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec | |||||
| if cloudbrain.SpecialPools != nil { | if cloudbrain.SpecialPools != nil { | ||||
| var debugGpuTypes []*models.GpuInfo | var debugGpuTypes []*models.GpuInfo | ||||
| var trainGpuTypes []*models.GpuInfo | var trainGpuTypes []*models.GpuInfo | ||||
| @@ -547,7 +558,18 @@ func CloudBrainRestart(ctx *context.Context) { | |||||
| for _, resourceType := range gpuInfos.GpuInfo { | for _, resourceType := range gpuInfos.GpuInfo { | ||||
| if resourceType.Queue == task.GpuQueue { | if resourceType.Queue == task.GpuQueue { | ||||
| hasSameResource = true | hasSameResource = true | ||||
| continue | |||||
| break | |||||
| } | |||||
| } | |||||
| if !hasSameResource && cloudbrain.SpecialPools != nil { | |||||
| for _, specialPool := range cloudbrain.SpecialPools.Pools { | |||||
| cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) | |||||
| for _, pool := range specialPool.Pool { | |||||
| if pool.Queue == task.GpuQueue { | |||||
| hasSameResource = true | |||||
| } | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -610,7 +632,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||||
| var task *models.Cloudbrain | var task *models.Cloudbrain | ||||
| var err error | var err error | ||||
| if jobType == models.JobTypeTrain { | |||||
| if jobType == models.JobTypeTrain || jobType == models.JobTypeInference { | |||||
| task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) | task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) | ||||
| } else { | } else { | ||||
| task, err = models.GetCloudbrainByIDWithDeleted(ctx.Params(":id")) | task, err = models.GetCloudbrainByIDWithDeleted(ctx.Params(":id")) | ||||
| @@ -641,6 +663,18 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||||
| ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | ||||
| } | } | ||||
| } | } | ||||
| } else if task.JobType == string(models.JobTypeInference) { | |||||
| if cloudbrain.InferenceResourceSpecs == nil { | |||||
| json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||||
| } | |||||
| for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { | |||||
| if tmp.Id == task.ResourceSpecId { | |||||
| ctx.Data["GpuNum"] = tmp.GpuNum | |||||
| ctx.Data["CpuNum"] = tmp.CpuNum | |||||
| ctx.Data["MemMiB"] = tmp.MemMiB | |||||
| ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | |||||
| } | |||||
| } | |||||
| } else { | } else { | ||||
| if cloudbrain.ResourceSpecs == nil { | if cloudbrain.ResourceSpecs == nil { | ||||
| json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) | json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) | ||||
| @@ -669,6 +703,15 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||||
| ctx.Data["resource_type"] = resourceType.Value | ctx.Data["resource_type"] = resourceType.Value | ||||
| } | } | ||||
| } | } | ||||
| } else if task.JobType == string(models.JobTypeInference) { | |||||
| if inferenceGpuInfos == nil { | |||||
| json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||||
| } | |||||
| for _, resourceType := range inferenceGpuInfos.GpuInfo { | |||||
| if resourceType.Queue == jobRes.Config.GpuType { | |||||
| ctx.Data["resource_type"] = resourceType.Value | |||||
| } | |||||
| } | |||||
| } else if cloudbrain.IsBenchmarkJob(task.JobType) { | } else if cloudbrain.IsBenchmarkJob(task.JobType) { | ||||
| if benchmarkGpuInfos == nil { | if benchmarkGpuInfos == nil { | ||||
| json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | ||||
| @@ -2472,7 +2515,7 @@ func InferenceCloudBrainJobNew(ctx *context.Context) { | |||||
| } | } | ||||
| func InferenceCloudBrainJobShow(ctx *context.Context) { | func InferenceCloudBrainJobShow(ctx *context.Context) { | ||||
| cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeTrain) | |||||
| cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeInference) | |||||
| } | } | ||||
| func DownloadInferenceResultFile(ctx *context.Context) { | func DownloadInferenceResultFile(ctx *context.Context) { | ||||
| @@ -119,7 +119,7 @@ func MustEnableModelArts(ctx *context.Context) { | |||||
| func NotebookNew(ctx *context.Context) { | func NotebookNew(ctx *context.Context) { | ||||
| notebookNewDataPrepare(ctx) | notebookNewDataPrepare(ctx) | ||||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeDebug) | |||||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||||
| ctx.Data["WaitCount"] = waitCount | ctx.Data["WaitCount"] = waitCount | ||||
| ctx.HTML(200, tplModelArtsNotebookNew) | ctx.HTML(200, tplModelArtsNotebookNew) | ||||
| } | } | ||||
| @@ -631,7 +631,7 @@ func TrainJobNew(ctx *context.Context) { | |||||
| ctx.ServerError("get new train-job info failed", err) | ctx.ServerError("get new train-job info failed", err) | ||||
| return | return | ||||
| } | } | ||||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) | |||||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||||
| ctx.Data["WaitCount"] = waitCount | ctx.Data["WaitCount"] = waitCount | ||||
| ctx.HTML(200, tplModelArtsTrainJobNew) | ctx.HTML(200, tplModelArtsTrainJobNew) | ||||
| } | } | ||||
| @@ -785,7 +785,7 @@ func TrainJobNewVersion(ctx *context.Context) { | |||||
| ctx.ServerError("get new train-job info failed", err) | ctx.ServerError("get new train-job info failed", err) | ||||
| return | return | ||||
| } | } | ||||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) | |||||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||||
| ctx.Data["WaitCount"] = waitCount | ctx.Data["WaitCount"] = waitCount | ||||
| ctx.HTML(200, tplModelArtsTrainJobVersionNew) | ctx.HTML(200, tplModelArtsTrainJobVersionNew) | ||||
| } | } | ||||
| @@ -2057,7 +2057,7 @@ func InferenceJobIndex(ctx *context.Context) { | |||||
| PageSize: setting.UI.IssuePagingNum, | PageSize: setting.UI.IssuePagingNum, | ||||
| }, | }, | ||||
| RepoID: repo.ID, | RepoID: repo.ID, | ||||
| Type: models.TypeCloudBrainAll, | |||||
| Type: ctx.QueryInt("type"), | |||||
| JobTypes: jobTypes, | JobTypes: jobTypes, | ||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -2100,7 +2100,7 @@ func InferenceJobNew(ctx *context.Context) { | |||||
| ctx.ServerError("get new inference-job info failed", err) | ctx.ServerError("get new inference-job info failed", err) | ||||
| return | return | ||||
| } | } | ||||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeInference) | |||||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||||
| ctx.Data["WaitCount"] = waitCount | ctx.Data["WaitCount"] = waitCount | ||||
| ctx.HTML(200, tplModelArtsInferenceJobNew) | ctx.HTML(200, tplModelArtsInferenceJobNew) | ||||
| } | } | ||||