| @@ -43,9 +43,10 @@ const ( | |||
| ) | |||
| var ( | |||
| ResourceSpecs *models.ResourceSpecs | |||
| TrainResourceSpecs *models.ResourceSpecs | |||
| SpecialPools *models.SpecialPools | |||
| ResourceSpecs *models.ResourceSpecs | |||
| TrainResourceSpecs *models.ResourceSpecs | |||
| InferenceResourceSpecs *models.ResourceSpecs | |||
| SpecialPools *models.SpecialPools | |||
| ) | |||
| type GenerateCloudBrainTaskReq struct { | |||
| @@ -222,7 +223,7 @@ func AdminOrImageCreaterRight(ctx *context.Context) { | |||
| func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||
| var resourceSpec *models.ResourceSpec | |||
| var versionCount int | |||
| if req.JobType == string(models.JobTypeTrain) || req.JobType == string(models.JobTypeInference) { | |||
| if req.JobType == string(models.JobTypeTrain) { | |||
| versionCount = 1 | |||
| if TrainResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) | |||
| @@ -233,6 +234,17 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||
| break | |||
| } | |||
| } | |||
| } else if req.JobType == string(models.JobTypeInference) { | |||
| if InferenceResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs) | |||
| } | |||
| for _, spec := range InferenceResourceSpecs.ResourceSpec { | |||
| if req.ResourceSpecId == spec.Id { | |||
| resourceSpec = spec | |||
| break | |||
| } | |||
| } | |||
| } else { | |||
| if ResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | |||
| @@ -247,21 +259,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||
| } | |||
| // If no spec was matched above, try to find it in the special (dedicated) resource pools. | |||
| if resourceSpec == nil && SpecialPools != nil { | |||
| for _, specialPool := range SpecialPools.Pools { | |||
| if resourceSpec != nil { | |||
| break | |||
| } | |||
| if specialPool.ResourceSpec != nil { | |||
| if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) { | |||
| for _, spec := range specialPool.ResourceSpec { | |||
| if req.ResourceSpecId == spec.Id { | |||
| resourceSpec = spec | |||
| break | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId) | |||
| } | |||
| if resourceSpec == nil { | |||
| @@ -452,6 +450,11 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||
| } | |||
| } | |||
| // If no spec was matched above, try to find it in the special (dedicated) resource pools. | |||
| if resourceSpec == nil && SpecialPools != nil { | |||
| resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) | |||
| } | |||
| if resourceSpec == nil { | |||
| log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | |||
| return errors.New("no such resourceSpec") | |||
| @@ -590,6 +593,23 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||
| return nil | |||
| } | |||
| func geMatchResourceSpec(jobType string, gpuQueue string, resourceSpecId int) *models.ResourceSpec { | |||
| for _, specialPool := range SpecialPools.Pools { | |||
| if specialPool.ResourceSpec != nil { | |||
| if IsElementExist(specialPool.JobType, jobType) && IsQueueInSpecialtPool(specialPool.Pool, gpuQueue) { | |||
| for _, spec := range specialPool.ResourceSpec { | |||
| if resourceSpecId == spec.Id { | |||
| return spec | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
| func DelCloudBrainJob(jobId string) string { | |||
| task, err := models.GetCloudbrainByJobID(jobId) | |||
| if err != nil { | |||
| @@ -453,20 +453,22 @@ var ( | |||
| DecompressOBSTaskName string | |||
| //cloudbrain config | |||
| CBAuthUser string | |||
| CBAuthPassword string | |||
| RestServerHost string | |||
| JobPath string | |||
| CBCodePathPrefix string | |||
| JobType string | |||
| GpuTypes string | |||
| SpecialPools string | |||
| DebugServerHost string | |||
| ResourceSpecs string | |||
| MaxDuration int64 | |||
| TrainGpuTypes string | |||
| TrainResourceSpecs string | |||
| MaxDatasetNum int | |||
| CBAuthUser string | |||
| CBAuthPassword string | |||
| RestServerHost string | |||
| JobPath string | |||
| CBCodePathPrefix string | |||
| JobType string | |||
| GpuTypes string | |||
| SpecialPools string | |||
| DebugServerHost string | |||
| ResourceSpecs string | |||
| MaxDuration int64 | |||
| TrainGpuTypes string | |||
| TrainResourceSpecs string | |||
| InferenceGpuTypes string | |||
| InferenceResourceSpecs string | |||
| MaxDatasetNum int | |||
| //benchmark config | |||
| IsBenchmarkEnabled bool | |||
| @@ -1312,6 +1314,8 @@ func NewContext() { | |||
| MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) | |||
| TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") | |||
| TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") | |||
| InferenceGpuTypes = sec.Key("INFERENCE_GPU_TYPES").MustString("") | |||
| InferenceResourceSpecs = sec.Key("INFERENCE_RESOURCE_SPECS").MustString("") | |||
| SpecialPools = sec.Key("SPECIAL_POOL").MustString("") | |||
| MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) | |||
| @@ -59,6 +59,7 @@ var ( | |||
| benchmarkGpuInfos *models.GpuInfos | |||
| benchmarkResourceSpecs *models.ResourceSpecs | |||
| trainGpuInfos *models.GpuInfos | |||
| inferenceGpuInfos *models.GpuInfos | |||
| ) | |||
| const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" | |||
| @@ -130,6 +131,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||
| } | |||
| ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo | |||
| if inferenceGpuInfos == nil { | |||
| json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||
| } | |||
| ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo | |||
| if benchmarkGpuInfos == nil { | |||
| json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | |||
| } | |||
| @@ -150,6 +156,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||
| } | |||
| ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | |||
| if cloudbrain.InferenceResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||
| } | |||
| ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec | |||
| if cloudbrain.SpecialPools != nil { | |||
| var debugGpuTypes []*models.GpuInfo | |||
| var trainGpuTypes []*models.GpuInfo | |||
| @@ -547,7 +558,18 @@ func CloudBrainRestart(ctx *context.Context) { | |||
| for _, resourceType := range gpuInfos.GpuInfo { | |||
| if resourceType.Queue == task.GpuQueue { | |||
| hasSameResource = true | |||
| continue | |||
| break | |||
| } | |||
| } | |||
| if !hasSameResource && cloudbrain.SpecialPools != nil { | |||
| for _, specialPool := range cloudbrain.SpecialPools.Pools { | |||
| cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) | |||
| for _, pool := range specialPool.Pool { | |||
| if pool.Queue == task.GpuQueue { | |||
| hasSameResource = true | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -610,7 +632,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||
| var task *models.Cloudbrain | |||
| var err error | |||
| if jobType == models.JobTypeTrain { | |||
| if jobType == models.JobTypeTrain || jobType == models.JobTypeInference { | |||
| task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) | |||
| } else { | |||
| task, err = models.GetCloudbrainByIDWithDeleted(ctx.Params(":id")) | |||
| @@ -641,6 +663,18 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||
| ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | |||
| } | |||
| } | |||
| } else if task.JobType == string(models.JobTypeInference) { | |||
| if cloudbrain.InferenceResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||
| } | |||
| for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { | |||
| if tmp.Id == task.ResourceSpecId { | |||
| ctx.Data["GpuNum"] = tmp.GpuNum | |||
| ctx.Data["CpuNum"] = tmp.CpuNum | |||
| ctx.Data["MemMiB"] = tmp.MemMiB | |||
| ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB | |||
| } | |||
| } | |||
| } else { | |||
| if cloudbrain.ResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) | |||
| @@ -669,6 +703,15 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||
| ctx.Data["resource_type"] = resourceType.Value | |||
| } | |||
| } | |||
| } else if task.JobType == string(models.JobTypeInference) { | |||
| if inferenceGpuInfos == nil { | |||
| json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||
| } | |||
| for _, resourceType := range inferenceGpuInfos.GpuInfo { | |||
| if resourceType.Queue == jobRes.Config.GpuType { | |||
| ctx.Data["resource_type"] = resourceType.Value | |||
| } | |||
| } | |||
| } else if cloudbrain.IsBenchmarkJob(task.JobType) { | |||
| if benchmarkGpuInfos == nil { | |||
| json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | |||
| @@ -2472,7 +2515,7 @@ func InferenceCloudBrainJobNew(ctx *context.Context) { | |||
| } | |||
| func InferenceCloudBrainJobShow(ctx *context.Context) { | |||
| cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeTrain) | |||
| cloudBrainShow(ctx, tplCloudBrainInferenceJobShow, models.JobTypeInference) | |||
| } | |||
| func DownloadInferenceResultFile(ctx *context.Context) { | |||
| @@ -119,7 +119,7 @@ func MustEnableModelArts(ctx *context.Context) { | |||
| func NotebookNew(ctx *context.Context) { | |||
| notebookNewDataPrepare(ctx) | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeDebug) | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||
| ctx.Data["WaitCount"] = waitCount | |||
| ctx.HTML(200, tplModelArtsNotebookNew) | |||
| } | |||
| @@ -631,7 +631,7 @@ func TrainJobNew(ctx *context.Context) { | |||
| ctx.ServerError("get new train-job info failed", err) | |||
| return | |||
| } | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||
| ctx.Data["WaitCount"] = waitCount | |||
| ctx.HTML(200, tplModelArtsTrainJobNew) | |||
| } | |||
| @@ -785,7 +785,7 @@ func TrainJobNewVersion(ctx *context.Context) { | |||
| ctx.ServerError("get new train-job info failed", err) | |||
| return | |||
| } | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeTrain) | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||
| ctx.Data["WaitCount"] = waitCount | |||
| ctx.HTML(200, tplModelArtsTrainJobVersionNew) | |||
| } | |||
| @@ -2057,7 +2057,7 @@ func InferenceJobIndex(ctx *context.Context) { | |||
| PageSize: setting.UI.IssuePagingNum, | |||
| }, | |||
| RepoID: repo.ID, | |||
| Type: models.TypeCloudBrainAll, | |||
| Type: ctx.QueryInt("type"), | |||
| JobTypes: jobTypes, | |||
| }) | |||
| if err != nil { | |||
| @@ -2100,7 +2100,7 @@ func InferenceJobNew(ctx *context.Context) { | |||
| ctx.ServerError("get new inference-job info failed", err) | |||
| return | |||
| } | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "", models.JobTypeInference) | |||
| waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "") | |||
| ctx.Data["WaitCount"] = waitCount | |||
| ctx.HTML(200, tplModelArtsInferenceJobNew) | |||
| } | |||