diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index 47d09d9ce..b987ca0aa 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -122,6 +122,11 @@ const (
 	//AI center
 	AICenterOfCloudBrainOne = "OpenIOne"
 	AICenterOfCloudBrainTwo = "OpenITwo"
+	AICenterOfChengdu       = "OpenIChengdu"
+
+	//ComputeResource
+	GPU = "GPU"
+	NPU = "NPU"
 )
 
 type Cloudbrain struct {
@@ -192,6 +197,7 @@ type Cloudbrain struct {
 	BenchmarkTypeRankLink string `xorm:"-"`
 	StartTime             timeutil.TimeStamp
 	EndTime               timeutil.TimeStamp
+	Spec                  *Specification `xorm:"-"`
 }
 
 func (task *Cloudbrain) ComputeAndSetDuration() {
@@ -1708,11 +1714,24 @@ func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, e
 }
 
 func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
+	session := x.NewSession()
+	defer session.Close()
+
+	err = session.Begin()
 	cloudbrain.TrainJobDuration = DURATION_STR_ZERO
-	if _, err = x.NoAutoTime().Insert(cloudbrain); err != nil {
+	if _, err = session.NoAutoTime().InsertOne(cloudbrain); err != nil {
+		session.Rollback()
 		return err
 	}
+	if cloudbrain.Spec != nil {
+		if _, err = session.Insert(NewCloudBrainSpec(cloudbrain.ID, *cloudbrain.Spec)); err != nil {
+			session.Rollback()
+			return err
+		}
+	}
+	session.Commit()
+
 	go IncreaseDatasetUseCount(cloudbrain.Uuid)
 	return nil
 }
@@ -1991,11 +2010,18 @@ func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) {
 		return err
 	}
-	if _, err = sess.NoAutoTime().Insert(new); err != nil {
+	if _, err = sess.NoAutoTime().InsertOne(new); err != nil {
 		sess.Rollback()
 		return err
 	}
+	if new.Spec != nil {
+		if _, err = sess.Insert(NewCloudBrainSpec(new.ID, *new.Spec)); err != nil {
+			sess.Rollback()
+			return err
+		}
+	}
+
 	if err = sess.Commit(); err != nil {
 		return err
 	}
@@ -2387,7 +2413,57 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) {
 		Find(&cloudbrains)
 }
+func GetCloudbrainWithDeletedByIDs(ids []int64) ([]*Cloudbrain, error) {
+	cloudbrains := make([]*Cloudbrain, 0)
+	return cloudbrains, x.
+		In("id", ids).Unscoped().Find(&cloudbrains)
+}
+
 func GetCloudbrainCountByJobName(jobName, jobType string, typeCloudbrain int) (int, error) {
 	count, err := x.Where("job_name = ? and job_type= ? and type = ?", jobName, jobType, typeCloudbrain).Count(new(Cloudbrain))
 	return int(count), err
 }
+
+func LoadSpecs(tasks []*Cloudbrain) error {
+	cloudbrainIds := make([]int64, len(tasks))
+	for i, v := range tasks {
+		cloudbrainIds[i] = v.ID
+	}
+	specs := make([]*CloudbrainSpec, 0)
+	err := x.In("cloudbrain_id", cloudbrainIds).Find(&specs)
+	if err != nil {
+		return err
+	}
+	specMap := make(map[int64]*CloudbrainSpec)
+	for _, v := range specs {
+		specMap[v.CloudbrainID] = v
+	}
+	for _, v := range tasks {
+		if specMap[v.ID] != nil {
+			v.Spec = specMap[v.ID].ConvertToSpecification()
+		}
+	}
+	return nil
+}
+
+func LoadSpecs4CloudbrainInfo(tasks []*CloudbrainInfo) error {
+	cloudbrainIds := make([]int64, len(tasks))
+	for i, v := range tasks {
+		cloudbrainIds[i] = v.Cloudbrain.ID
+	}
+	specs := make([]*CloudbrainSpec, 0)
+	err := x.In("cloudbrain_id", cloudbrainIds).Find(&specs)
+	if err != nil {
+		return err
+	}
+	specMap := make(map[int64]*CloudbrainSpec)
+	for _, v := range specs {
+		specMap[v.CloudbrainID] = v
+	}
+	for _, v := range tasks {
+		if specMap[v.Cloudbrain.ID] != nil {
+			v.Cloudbrain.Spec = specMap[v.Cloudbrain.ID].ConvertToSpecification()
+		}
+	}
+	return nil
+}
diff --git a/models/cloudbrain_spec.go b/models/cloudbrain_spec.go
new file mode 100644
index 000000000..8aa652b17
--- /dev/null
+++ b/models/cloudbrain_spec.go
@@ -0,0 +1,109 @@
+package models
+
+import (
+	"code.gitea.io/gitea/modules/timeutil"
+)
+
+type CloudbrainSpec struct {
+	CloudbrainID    int64 `xorm:"pk"`
+	SpecId          int64 `xorm:"index"`
+	SourceSpecId    string
+	AccCardsNum     int
+	AccCardType     string
+	CpuCores        int
+	MemGiB          float32
+	GPUMemGiB       float32
+	ShareMemGiB     float32
+	ComputeResource string
+	UnitPrice       int
+	QueueId         int64
+	QueueCode       string
+	Cluster         string
+	AiCenterCode    string
+	AiCenterName    string
+	IsExclusive     bool
+	ExclusiveOrg    string
+	CreatedTime     timeutil.TimeStamp `xorm:"created"`
+	UpdatedTime     timeutil.TimeStamp `xorm:"updated"`
+}
+
+func (s CloudbrainSpec) ConvertToSpecification() *Specification {
+	return &Specification{
+		ID:              s.SpecId,
+		SourceSpecId:    s.SourceSpecId,
+		AccCardsNum:     s.AccCardsNum,
+		AccCardType:     s.AccCardType,
+		CpuCores:        s.CpuCores,
+		MemGiB:          s.MemGiB,
+		GPUMemGiB:       s.GPUMemGiB,
+		ShareMemGiB:     s.ShareMemGiB,
+		ComputeResource: s.ComputeResource,
+		UnitPrice:       s.UnitPrice,
+		QueueId:         s.QueueId,
+		QueueCode:       s.QueueCode,
+		Cluster:         s.Cluster,
+		AiCenterCode:    s.AiCenterCode,
+		AiCenterName:    s.AiCenterName,
+		IsExclusive:     s.IsExclusive,
+		ExclusiveOrg:    s.ExclusiveOrg,
+	}
+}
+
+func NewCloudBrainSpec(cloudbrainId int64, s Specification) CloudbrainSpec {
+	return CloudbrainSpec{
+		CloudbrainID:    cloudbrainId,
+		SpecId:          s.ID,
+		SourceSpecId:    s.SourceSpecId,
+		AccCardsNum:     s.AccCardsNum,
+		AccCardType:     s.AccCardType,
+		CpuCores:        s.CpuCores,
+		MemGiB:          s.MemGiB,
+		GPUMemGiB:       s.GPUMemGiB,
+		ShareMemGiB:     s.ShareMemGiB,
+		ComputeResource: s.ComputeResource,
+		UnitPrice:       s.UnitPrice,
+		QueueId:         s.QueueId,
+		QueueCode:       s.QueueCode,
+		Cluster:         s.Cluster,
+		AiCenterCode:    s.AiCenterCode,
+		AiCenterName:    s.AiCenterName,
+		IsExclusive:     s.IsExclusive,
+		ExclusiveOrg:    s.ExclusiveOrg,
+	}
+}
+
+func InsertCloudbrainSpec(c CloudbrainSpec) (int64, error) {
+	return x.Insert(&c)
+}
+
+func GetCloudbrainSpecByID(cloudbrainId int64) (*CloudbrainSpec, error) {
+	r := &CloudbrainSpec{}
+	if has, err := x.Where("cloudbrain_id = ?", cloudbrainId).Get(r); err != nil {
+		return nil, err
+	} else if !has {
+		return nil, nil
+	}
+	return r, nil
+}
+
+func FindCloudbrainTask(page, pageSize int) ([]*Cloudbrain, error) {
+ r := make([]*Cloudbrain, 0) + err := x.Unscoped(). + Limit(pageSize, (page-1)*pageSize). + OrderBy("cloudbrain.id"). + Find(&r) + if err != nil { + return nil, err + } + return r, nil +} + +func CountNoSpecHistoricTask() (int64, error) { + n, err := x.Unscoped(). + Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)"). + Count(&Cloudbrain{}) + if err != nil { + return 0, err + } + return n, nil +} diff --git a/models/dataset.go b/models/dataset.go index 4cff4d6d1..720850ed9 100755 --- a/models/dataset.go +++ b/models/dataset.go @@ -131,13 +131,17 @@ func (datasets DatasetList) loadAttachmentAttributes(opts *SearchDatasetOptions) permission = false datasets[i].Repo.GetOwner() if !permission { - isCollaborator, _ := datasets[i].Repo.IsCollaborator(opts.User.ID) - isInRepoTeam,_:=datasets[i].Repo.IsInRepoTeam(opts.User.ID) - - if isCollaborator ||isInRepoTeam { - log.Info("Collaborator user may visit the attach.") + if datasets[i].Repo.OwnerID==opts.User.ID{ permission = true + }else{ + isCollaborator, _ := datasets[i].Repo.IsCollaborator(opts.User.ID) + isInRepoTeam,_:=datasets[i].Repo.IsInRepoTeam(opts.User.ID) + + if isCollaborator ||isInRepoTeam { + permission = true + } } + } permissionMap[datasets[i].ID] = permission diff --git a/models/models.go b/models/models.go index af0f7ac79..0bd3f8a6c 100755 --- a/models/models.go +++ b/models/models.go @@ -150,6 +150,7 @@ func init() { new(ResourceScene), new(ResourceSceneSpec), new(AdminOperateLog), + new(CloudbrainSpec), new(CloudbrainTemp), new(DatasetReference), ) diff --git a/models/resource_queue.go b/models/resource_queue.go index ff78fcc40..fc0dd8cb5 100644 --- a/models/resource_queue.go +++ b/models/resource_queue.go @@ -71,6 +71,8 @@ func (r ResourceQueueReq) ToDTO() ResourceQueue { q.AiCenterName = "云脑一" } else if r.AiCenterCode == AICenterOfCloudBrainTwo { q.AiCenterName = "云脑二" + } else if r.AiCenterCode == AICenterOfChengdu { + q.AiCenterName = "启智成都智算" } } return q diff --git a/models/resource_specification.go b/models/resource_specification.go index dca6647ab..2da8d015d 100644 --- a/models/resource_specification.go +++ b/models/resource_specification.go @@ -2,6 +2,7 @@ package models import ( "code.gitea.io/gitea/modules/timeutil" + "fmt" "xorm.io/builder" ) @@ -22,6 +23,7 @@ type ResourceSpecification struct { ShareMemGiB float32 UnitPrice int Status int + IsAvailable bool IsAutomaticSync bool CreatedTime timeutil.TimeStamp `xorm:"created"` CreatedBy int64 @@ -40,6 +42,7 @@ func (r ResourceSpecification) ConvertToRes() *ResourceSpecificationRes { GPUMemGiB: r.GPUMemGiB, UnitPrice: r.UnitPrice, Status: r.Status, + IsAvailable: r.IsAvailable, UpdatedTime: r.UpdatedTime, } } @@ -72,14 +75,16 @@ func (r ResourceSpecificationReq) ToDTO() ResourceSpecification { IsAutomaticSync: r.IsAutomaticSync, CreatedBy: r.CreatorId, UpdatedBy: r.CreatorId, + IsAvailable: true, } } type SearchResourceSpecificationOptions struct { ListOptions - QueueId int64 - Status int - Cluster string + QueueId int64 + Status int + Cluster string + AvailableCode int } type SearchResourceBriefSpecificationOptions struct { @@ -113,6 +118,7 @@ type ResourceSpecificationRes struct { ShareMemGiB float32 UnitPrice int Status int + IsAvailable bool UpdatedTime timeutil.TimeStamp } @@ -141,6 +147,53 @@ func (r ResourceSpecAndQueue) ConvertToRes() *ResourceSpecAndQueueRes { } } +type FindSpecsOptions struct { + JobType JobType + ComputeResource string + Cluster string + AiCenterCode string + SpecId int64 + 
QueueCode string + SourceSpecId string + AccCardsNum int + UseAccCardsNum bool + AccCardType string + CpuCores int + UseCpuCores bool + MemGiB float32 + UseMemGiB bool + GPUMemGiB float32 + UseGPUMemGiB bool + ShareMemGiB float32 + UseShareMemGiB bool + //if true,find specs no matter used or not used in scene. if false,only find specs used in scene + RequestAll bool +} + +type Specification struct { + ID int64 + SourceSpecId string + AccCardsNum int + AccCardType string + CpuCores int + MemGiB float32 + GPUMemGiB float32 + ShareMemGiB float32 + ComputeResource string + UnitPrice int + QueueId int64 + QueueCode string + Cluster string + AiCenterCode string + AiCenterName string + IsExclusive bool + ExclusiveOrg string +} + +func (Specification) TableName() string { + return "resource_specification" +} + func InsertResourceSpecification(r ResourceSpecification) (int64, error) { return x.Insert(&r) } @@ -167,6 +220,11 @@ func SearchResourceSpecification(opts SearchResourceSpecificationOptions) (int64 if opts.Cluster != "" { cond = cond.And(builder.Eq{"resource_queue.cluster": opts.Cluster}) } + if opts.AvailableCode == 1 { + cond = cond.And(builder.Eq{"resource_specification.is_available": true}) + } else if opts.AvailableCode == 2 { + cond = cond.And(builder.Eq{"resource_specification.is_available": false}) + } //cond = cond.And(builder.Or(builder.Eq{"resource_queue.deleted_time": 0}).Or(builder.IsNull{"resource_queue.deleted_time"})) n, err := x.Where(cond).Join("INNER", "resource_queue", "resource_queue.ID = resource_specification.queue_id"). Unscoped().Count(&ResourceSpecAndQueue{}) @@ -256,7 +314,7 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS return err } if len(deleteIds) > 0 { - if _, err = sess.In("id", deleteIds).Update(&ResourceSpecification{Status: SpecOffShelf}); err != nil { + if _, err = sess.Cols("status", "is_available").In("id", deleteIds).Update(&ResourceSpecification{Status: SpecOffShelf, IsAvailable: false}); err != nil { return err } if _, err = sess.In("spec_id", deleteIds).Delete(&ResourceSceneSpec{}); err != nil { @@ -267,7 +325,7 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS //update exists specs if len(updateList) > 0 { for _, v := range updateList { - if _, err = sess.ID(v.ID).Update(&v); err != nil { + if _, err = sess.ID(v.ID).UseBool("is_available").Update(&v); err != nil { return err } } @@ -283,3 +341,221 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS return sess.Commit() } + +//FindSpecs +func FindSpecs(opts FindSpecsOptions) ([]*Specification, error) { + var cond = builder.NewCond() + if !opts.RequestAll && opts.JobType != "" { + cond = cond.And(builder.Eq{"resource_scene.job_type": opts.JobType}) + } + if opts.ComputeResource != "" { + cond = cond.And(builder.Eq{"resource_queue.compute_resource": opts.ComputeResource}) + } + if opts.Cluster != "" { + cond = cond.And(builder.Eq{"resource_queue.cluster": opts.Cluster}) + } + if opts.AiCenterCode != "" { + cond = cond.And(builder.Eq{"resource_queue.ai_center_code": opts.AiCenterCode}) + } + if opts.SpecId > 0 { + cond = cond.And(builder.Eq{"resource_specification.id": opts.SpecId}) + } + if opts.QueueCode != "" { + cond = cond.And(builder.Eq{"resource_queue.queue_code": opts.QueueCode}) + } + if opts.SourceSpecId != "" { + cond = cond.And(builder.Eq{"resource_specification.source_spec_id": opts.SourceSpecId}) + } + if opts.UseAccCardsNum { + cond = 
cond.And(builder.Eq{"resource_specification.acc_cards_num": opts.AccCardsNum}) + } + if opts.AccCardType != "" { + cond = cond.And(builder.Eq{"resource_queue.acc_card_type": opts.AccCardType}) + } + if opts.UseCpuCores { + cond = cond.And(builder.Eq{"resource_specification.cpu_cores": opts.CpuCores}) + } + if opts.UseMemGiB { + cond = cond.And(builder.Eq{"resource_specification.mem_gi_b": opts.MemGiB}) + } + if opts.UseGPUMemGiB { + cond = cond.And(builder.Eq{"resource_specification.gpu_mem_gi_b": opts.GPUMemGiB}) + } + if opts.UseShareMemGiB { + cond = cond.And(builder.Eq{"resource_specification.share_mem_gi_b": opts.ShareMemGiB}) + } + r := make([]*Specification, 0) + s := x.Where(cond). + Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id") + + if !opts.RequestAll { + s = s.Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id"). + Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id") + } + err := s.OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc,resource_specification.cpu_cores asc,resource_specification.mem_gi_b asc,resource_specification.share_mem_gi_b asc"). + Unscoped().Find(&r) + if err != nil { + return nil, err + } + return r, nil +} + +func InitQueueAndSpec(queue ResourceQueue, spec ResourceSpecification) (*Specification, error) { + sess := x.NewSession() + defer sess.Close() + + sess.Begin() + param := ResourceQueue{ + QueueCode: queue.QueueCode, + Cluster: queue.Cluster, + AiCenterCode: queue.AiCenterCode, + ComputeResource: queue.ComputeResource, + AccCardType: queue.AccCardType, + } + _, err := sess.Get(¶m) + if err != nil { + sess.Rollback() + return nil, err + } + if param.ID == 0 { + _, err = sess.InsertOne(&queue) + if err != nil { + sess.Rollback() + return nil, err + } + } else { + queue = param + } + + spec.QueueId = queue.ID + _, err = sess.InsertOne(&spec) + if err != nil { + sess.Rollback() + return nil, err + } + sess.Commit() + return BuildSpecification(queue, spec), nil +} + +func BuildSpecification(queue ResourceQueue, spec ResourceSpecification) *Specification { + return &Specification{ + ID: spec.ID, + SourceSpecId: spec.SourceSpecId, + AccCardsNum: spec.AccCardsNum, + AccCardType: queue.AccCardType, + CpuCores: spec.CpuCores, + MemGiB: spec.MemGiB, + GPUMemGiB: spec.GPUMemGiB, + ShareMemGiB: spec.ShareMemGiB, + ComputeResource: queue.ComputeResource, + UnitPrice: spec.UnitPrice, + QueueId: queue.ID, + QueueCode: queue.QueueCode, + Cluster: queue.Cluster, + AiCenterCode: queue.AiCenterCode, + AiCenterName: queue.AiCenterName, + } +} + +func GetCloudbrainOneAccCardType(queueCode string) string { + switch queueCode { + case "a100": + return "A100" + case "openidebug": + return "T4" + case "openidgx": + return "V100" + + } + return "" +} + +var cloudbrainTwoSpecsInitFlag = false +var cloudbrainTwoSpecs map[string]*Specification + +func GetCloudbrainTwoSpecs() (map[string]*Specification, error) { + if !cloudbrainTwoSpecsInitFlag { + r, err := InitCloudbrainTwoSpecs() + if err != nil { + return nil, err + } + cloudbrainTwoSpecsInitFlag = true + cloudbrainTwoSpecs = r + } + return cloudbrainTwoSpecs, nil +} + +func InitCloudbrainTwoSpecs() (map[string]*Specification, error) { + r := make(map[string]*Specification, 0) + + queue, err := GetResourceQueue(&ResourceQueue{QueueCode: "openisupport"}) + if err != nil { + return nil, err + } + if queue == nil { + queue = &ResourceQueue{ + QueueCode: 
"openisupport", + Cluster: OpenICluster, + AiCenterCode: AICenterOfCloudBrainTwo, + AiCenterName: "云脑二", + ComputeResource: NPU, + AccCardType: "ASCEND910", + Remark: "处理历史云脑任务时自动生成", + } + _, err = x.InsertOne(queue) + if err != nil { + return nil, err + } + } + for i := 1; i <= 8; i = i * 2 { + sourceSpecId := "modelarts.bm.910.arm.public." + fmt.Sprint(i) + spec, err := GetResourceSpecification(&ResourceSpecification{ + SourceSpecId: sourceSpecId, + QueueId: queue.ID, + }) + if err != nil { + return nil, err + } + if spec == nil { + spec = &ResourceSpecification{ + QueueId: queue.ID, + SourceSpecId: sourceSpecId, + AccCardsNum: i, + CpuCores: i * 24, + MemGiB: float32(i * 256), + GPUMemGiB: float32(32), + Status: SpecOffShelf, + IsAvailable: true, + } + _, err = x.Insert(spec) + if err != nil { + return nil, err + } + } + r[sourceSpecId] = BuildSpecification(*queue, *spec) + } + return r, nil +} + +var grampusSpecsInitFlag = false +var grampusSpecs map[string]*Specification + +func GetGrampusSpecs() (map[string]*Specification, error) { + if !grampusSpecsInitFlag { + specMap := make(map[string]*Specification, 0) + r, err := FindSpecs(FindSpecsOptions{ + Cluster: C2NetCluster, + RequestAll: true, + }) + if err != nil { + return nil, err + } + for _, spec := range r { + specMap[spec.SourceSpecId] = spec + specMap[spec.SourceSpecId+"_"+spec.AiCenterCode] = spec + } + grampusSpecsInitFlag = true + grampusSpecs = specMap + } + return grampusSpecs, nil +} diff --git a/modules/auth/cloudbrain.go b/modules/auth/cloudbrain.go index 39685990d..5bd294f2a 100755 --- a/modules/auth/cloudbrain.go +++ b/modules/auth/cloudbrain.go @@ -24,6 +24,7 @@ type CreateCloudBrainForm struct { Params string `form:"run_para_list"` BranchName string `form:"branch_name"` DatasetName string `form:"dataset_name"` + SpecId int64 `form:"spec_id"` } type CommitImageCloudBrainForm struct { @@ -72,6 +73,7 @@ type CreateCloudBrainInferencForm struct { CkptName string `form:"ckpt_name" binding:"Required"` LabelName string `form:"label_names" binding:"Required"` DatasetName string `form:"dataset_name"` + SpecId int64 `form:"spec_id"` } func (f *CreateCloudBrainForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go index 0338d2ae7..21008ea09 100755 --- a/modules/auth/grampus.go +++ b/modules/auth/grampus.go @@ -11,15 +11,14 @@ type CreateGrampusTrainJobForm struct { Attachment string `form:"attachment" binding:"Required"` BootFile string `form:"boot_file" binding:"Required"` ImageID string `form:"image_id" binding:"Required"` - FlavorID string `form:"flavor" binding:"Required"` Params string `form:"run_para_list" binding:"Required"` Description string `form:"description"` BranchName string `form:"branch_name" binding:"Required"` - FlavorName string `form:"flavor_name" binding:"Required"` EngineName string `form:"engine_name" binding:"Required"` WorkServerNumber int `form:"work_server_number" binding:"Required"` Image string `form:"image"` DatasetName string `form:"dataset_name"` + SpecId int64 `form:"spec_id"` } func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index ce41f5d1e..23e1f325a 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -22,6 +22,7 @@ type CreateModelArtsNotebookForm struct { Description string `form:"description"` Flavor string `form:"flavor" binding:"Required"` ImageId string 
`form:"image_id" binding:"Required"` + SpecId int64 `form:"spec_id" binding:"Required"` } func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { @@ -46,6 +47,7 @@ type CreateModelArtsTrainJobForm struct { VersionName string `form:"version_name" binding:"Required"` FlavorName string `form:"flaver_names" binding:"Required"` EngineName string `form:"engine_names" binding:"Required"` + SpecId int64 `form:"spec_id" binding:"Required"` } type CreateModelArtsInferenceJobForm struct { @@ -71,6 +73,7 @@ type CreateModelArtsInferenceJobForm struct { ModelName string `form:"model_name" binding:"Required"` ModelVersion string `form:"model_version" binding:"Required"` CkptName string `form:"ckpt_name" binding:"Required"` + SpecId int64 `form:"spec_id" binding:"Required"` } func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index faafbabe1..748af4a29 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -61,7 +61,6 @@ type GenerateCloudBrainTaskReq struct { Snn4ImageNetPath string BrainScorePath string JobType string - GpuQueue string Description string BranchName string BootFile string @@ -72,13 +71,13 @@ type GenerateCloudBrainTaskReq struct { DatasetInfos map[string]models.DatasetInfo BenchmarkTypeID int BenchmarkChildTypeID int - ResourceSpecId int ResultPath string TrainUrl string ModelName string ModelVersion string CkptName string LabelName string + Spec *models.Specification } func GetCloudbrainDebugCommand() string { @@ -227,50 +226,9 @@ func AdminOrImageCreaterRight(ctx *context.Context) { } func GenerateTask(req GenerateCloudBrainTaskReq) error { - var resourceSpec *models.ResourceSpec var versionCount int if req.JobType == string(models.JobTypeTrain) { versionCount = 1 - if TrainResourceSpecs == nil { - json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) - } - for _, spec := range TrainResourceSpecs.ResourceSpec { - if req.ResourceSpecId == spec.Id { - resourceSpec = spec - break - } - } - } else if req.JobType == string(models.JobTypeInference) { - if InferenceResourceSpecs == nil { - json.Unmarshal([]byte(setting.InferenceResourceSpecs), &InferenceResourceSpecs) - } - for _, spec := range InferenceResourceSpecs.ResourceSpec { - if req.ResourceSpecId == spec.Id { - resourceSpec = spec - break - } - } - - } else { - if ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) - } - for _, spec := range ResourceSpecs.ResourceSpec { - if req.ResourceSpecId == spec.Id { - resourceSpec = spec - break - } - } - - } - //如果没有匹配到spec信息,尝试从专属资源池获取 - if resourceSpec == nil && SpecialPools != nil { - resourceSpec = geMatchResourceSpec(req.JobType, req.GpuQueue, req.ResourceSpecId) - } - - if resourceSpec == nil { - log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"]) - return errors.New("no such resourceSpec") } volumes := []models.Volume{ @@ -342,7 +300,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { jobResult, err := CreateJob(req.JobName, models.CreateJobParams{ JobName: req.JobName, RetryCount: 1, - GpuType: req.GpuQueue, + GpuType: req.Spec.QueueCode, Image: req.Image, TaskRoles: []models.TaskRole{ { @@ -350,10 +308,10 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { TaskNumber: 1, MinSucceededTaskCount: 1, MinFailedTaskCount: 1, - CPUNumber: resourceSpec.CpuNum, - GPUNumber: 
resourceSpec.GpuNum, - MemoryMB: resourceSpec.MemMiB, - ShmMB: resourceSpec.ShareMemMiB, + CPUNumber: req.Spec.CpuCores, + GPUNumber: req.Spec.AccCardsNum, + MemoryMB: int(req.Spec.MemGiB * 1024), + ShmMB: int(req.Spec.ShareMemGiB * 1024), Command: req.Command, NeedIBDevice: false, IsMainRole: false, @@ -384,8 +342,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { Type: models.TypeCloudBrainOne, Uuid: req.Uuids, Image: req.Image, - GpuQueue: req.GpuQueue, - ResourceSpecId: req.ResourceSpecId, + GpuQueue: req.Spec.QueueCode, ComputeResource: models.GPUResource, BenchmarkTypeID: req.BenchmarkTypeID, BenchmarkChildTypeID: req.BenchmarkChildTypeID, @@ -405,6 +362,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { CreatedUnix: createTime, UpdatedUnix: createTime, CommitID: req.CommitID, + Spec: req.Spec, }) if err != nil { @@ -416,6 +374,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { log.Error("GetCloudbrainByJobID failed: %v", err.Error()) return err } + stringId := strconv.FormatInt(task.ID, 10) if IsBenchmarkJob(req.JobType) { @@ -447,25 +406,7 @@ func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTy func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error { jobName := task.JobName - var resourceSpec *models.ResourceSpec - if ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) - } - for _, spec := range ResourceSpecs.ResourceSpec { - if task.ResourceSpecId == spec.Id { - resourceSpec = spec - } - } - - //如果没有匹配到spec信息,尝试从专属资源池获取 - if resourceSpec == nil && SpecialPools != nil { - resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) - } - - if resourceSpec == nil { - log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) - return errors.New("no such resourceSpec") - } + spec := task.Spec var datasetInfos map[string]models.DatasetInfo if task.Uuid != "" { var err error @@ -547,10 +488,10 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e TaskNumber: 1, MinSucceededTaskCount: 1, MinFailedTaskCount: 1, - CPUNumber: resourceSpec.CpuNum, - GPUNumber: resourceSpec.GpuNum, - MemoryMB: resourceSpec.MemMiB, - ShmMB: resourceSpec.ShareMemMiB, + CPUNumber: spec.CpuCores, + GPUNumber: spec.AccCardsNum, + MemoryMB: int(spec.MemGiB * 1024), + ShmMB: int(spec.ShareMemGiB * 1024), Command: GetCloudbrainDebugCommand(), //Command, NeedIBDevice: false, IsMainRole: false, @@ -588,6 +529,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e CreatedUnix: createTime, UpdatedUnix: createTime, BranchName: task.BranchName, + Spec: spec, } err = models.RestartCloudbrain(task, newTask) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index ea4f7afbb..687fb4959 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -37,11 +37,10 @@ var ( ) type GenerateTrainJobReq struct { - JobName string - Command string - ResourceSpecId string - ImageUrl string //与image_id二选一,都有的情况下优先image_url - ImageId string + JobName string + Command string + ImageUrl string //与image_id二选一,都有的情况下优先image_url + ImageId string DisplayJobName string Uuid string @@ -58,7 +57,6 @@ type GenerateTrainJobReq struct { BranchName string PreVersionId int64 PreVersionName string - FlavorName string VersionCount int EngineName string TotalVersionCount int @@ -66,6 +64,7 @@ type GenerateTrainJobReq struct { ProcessType string DatasetName string Params string + Spec 
*models.Specification } func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { @@ -79,7 +78,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error { Name: req.JobName, Command: req.Command, - ResourceSpecId: req.ResourceSpecId, + ResourceSpecId: req.Spec.SourceSpecId, ImageId: req.ImageId, ImageUrl: req.ImageUrl, CenterID: centerID, @@ -114,15 +113,14 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error Parameters: req.Params, BootFile: req.BootFile, DataUrl: req.DataUrl, - FlavorCode: req.ResourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, - FlavorName: req.FlavorName, EngineName: req.EngineName, VersionCount: req.VersionCount, TotalVersionCount: req.TotalVersionCount, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: req.Spec, }) if err != nil { diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 7cf0861fd..4539699ad 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -84,7 +84,6 @@ type GenerateTrainJobReq struct { BootFileUrl string DataUrl string TrainUrl string - FlavorCode string LogUrl string PoolID string WorkServerNumber int @@ -96,6 +95,7 @@ type GenerateTrainJobReq struct { BranchName string PreVersionId int64 PreVersionName string + FlavorCode string FlavorName string VersionCount int EngineName string @@ -103,6 +103,7 @@ type GenerateTrainJobReq struct { UserImageUrl string UserCommand string DatasetName string + Spec *models.Specification } type GenerateInferenceJobReq struct { @@ -115,7 +116,6 @@ type GenerateInferenceJobReq struct { BootFileUrl string DataUrl string TrainUrl string - FlavorCode string LogUrl string PoolID string WorkServerNumber int @@ -134,6 +134,7 @@ type GenerateInferenceJobReq struct { ModelVersion string CkptName string ResultUrl string + Spec *models.Specification DatasetName string } @@ -265,7 +266,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor strin return nil } -func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error { +func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error { if poolInfos == nil { json.Unmarshal([]byte(setting.PoolInfos), &poolInfos) } @@ -279,7 +280,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc jobResult, err := createNotebook2(models.CreateNotebook2Params{ JobName: jobName, Description: description, - Flavor: flavor, + Flavor: spec.SourceSpecId, Duration: autoStopDurationMs, ImageID: imageId, PoolID: poolInfos.PoolInfo[0].PoolId, @@ -316,7 +317,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc RepoID: ctx.Repo.Repository.ID, JobID: jobResult.ID, JobName: jobName, - FlavorCode: flavor, + FlavorCode: spec.SourceSpecId, DisplayJobName: displayJobName, JobType: string(models.JobTypeDebug), Type: models.TypeCloudBrainTwo, @@ -326,6 +327,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc Description: description, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: spec, } err = models.CreateCloudbrain(task) @@ -356,7 +358,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error PoolID: req.PoolID, CreateVersion: true, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: 
req.Parameters, UserImageUrl: req.UserImageUrl, @@ -378,7 +380,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error PoolID: req.PoolID, CreateVersion: true, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, }, @@ -427,7 +429,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error BootFile: req.BootFile, DataUrl: req.DataUrl, LogUrl: req.LogUrl, - FlavorCode: req.FlavorCode, + FlavorCode: req.Spec.SourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, @@ -436,6 +438,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error TotalVersionCount: req.TotalVersionCount, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: req.Spec, }) if createErr != nil { @@ -487,7 +490,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job LogUrl: req.LogUrl, PoolID: req.PoolID, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, PreVersionId: req.PreVersionId, @@ -508,7 +511,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job LogUrl: req.LogUrl, PoolID: req.PoolID, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, PreVersionId: req.PreVersionId, @@ -575,7 +578,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job DataUrl: req.DataUrl, LogUrl: req.LogUrl, PreVersionId: req.PreVersionId, - FlavorCode: req.FlavorCode, + FlavorCode: req.Spec.SourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, @@ -584,6 +587,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job VersionCount: VersionListCount + 1, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: req.Spec, }) if createErr != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error()) @@ -674,7 +678,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e PoolID: req.PoolID, CreateVersion: true, Flavor: models.Flavor{ - Code: req.FlavorCode, + Code: req.Spec.SourceSpecId, }, Parameter: req.Parameters, }, @@ -726,7 +730,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e BootFile: req.BootFile, DataUrl: req.DataUrl, LogUrl: req.LogUrl, - FlavorCode: req.FlavorCode, + FlavorCode: req.Spec.SourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, @@ -742,6 +746,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e ResultUrl: req.ResultUrl, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: req.Spec, }) if err != nil { diff --git a/modules/modelarts_cd/modelarts.go b/modules/modelarts_cd/modelarts.go index 25324dbd5..330b048ca 100755 --- a/modules/modelarts_cd/modelarts.go +++ b/modules/modelarts_cd/modelarts.go @@ -88,7 +88,7 @@ type Parameters struct { } `json:"parameter"` } -func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error { +func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error { imageName, err := GetNotebookImageName(imageId) if err != nil { log.Error("GetNotebookImageName failed: %v", err.Error()) @@ -98,7 +98,7 @@ func 
GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, descr jobResult, err := createNotebook(models.CreateNotebookWithoutPoolParams{ JobName: jobName, Description: description, - Flavor: flavor, + Flavor: spec.SourceSpecId, Duration: autoStopDurationMs, ImageID: imageId, Feature: models.NotebookFeature, @@ -134,7 +134,7 @@ func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, descr RepoID: ctx.Repo.Repository.ID, JobID: jobResult.ID, JobName: jobName, - FlavorCode: flavor, + FlavorCode: spec.SourceSpecId, DisplayJobName: displayJobName, JobType: string(models.JobTypeDebug), Type: models.TypeCDCenter, @@ -144,6 +144,7 @@ func GenerateNotebook(ctx *context.Context, displayJobName, jobName, uuid, descr Description: description, CreatedUnix: createTime, UpdatedUnix: createTime, + Spec: spec, } err = models.CreateCloudbrain(task) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 26a1538d2..430803456 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -3208,6 +3208,9 @@ gpu_num = GPU cpu_num = CPU memory = Memory shared_memory = Shared Memory +gpu_memory = GPU Memory +free = Free +point_hr = Point/hr DEBUG = DEBUG diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 636697cb1..f2f15ca55 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -3227,6 +3227,9 @@ gpu_num = GPU数 cpu_num = CPU数 memory = 内存 shared_memory = 共享内存 +gpu_memory = 显存 +free = 免费 +point_hr = 积分/时 DEBUG = 调试任务 SNN4IMAGENET = 评测任务 diff --git a/routers/admin/cloudbrains.go b/routers/admin/cloudbrains.go index ec0034f4f..fcb878627 100755 --- a/routers/admin/cloudbrains.go +++ b/routers/admin/cloudbrains.go @@ -92,13 +92,13 @@ func CloudBrains(ctx *context.Context) { return } + models.LoadSpecs4CloudbrainInfo(ciTasks) + for i, task := range ciTasks { ciTasks[i].CanDebug = true ciTasks[i].CanDel = true ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource ciTasks[i].Cloudbrain.AiCenter = repo.GetCloudbrainAiCenter(task.Cloudbrain, ctx) - _, cardType, _ := repo.GetCloudbrainCardNumAndType(task.Cloudbrain) - ciTasks[i].Cloudbrain.CardType = cardType ciTasks[i].Cloudbrain.Cluster = repo.GetCloudbrainCluster(task.Cloudbrain, ctx) } diff --git a/routers/admin/resources.go b/routers/admin/resources.go index 7d267c19c..8a8c55f86 100644 --- a/routers/admin/resources.go +++ b/routers/admin/resources.go @@ -8,6 +8,8 @@ import ( "code.gitea.io/gitea/routers/response" "code.gitea.io/gitea/services/cloudbrain/resource" "net/http" + "strconv" + "strings" ) const ( @@ -118,11 +120,13 @@ func GetResourceSpecificationList(ctx *context.Context) { queue := ctx.QueryInt64("queue") status := ctx.QueryInt("status") cluster := ctx.Query("cluster") + available := ctx.QueryInt("available") list, err := resource.GetResourceSpecificationList(models.SearchResourceSpecificationOptions{ - ListOptions: models.ListOptions{Page: page, PageSize: 10}, - QueueId: queue, - Status: status, - Cluster: cluster, + ListOptions: models.ListOptions{Page: page, PageSize: 10}, + QueueId: queue, + Status: status, + Cluster: cluster, + AvailableCode: available, }) if err != nil { log.Error("GetResourceSpecificationList error.%v", err) @@ -246,3 +250,37 @@ func UpdateResourceScene(ctx *context.Context, req models.ResourceSceneReq) { } ctx.JSON(http.StatusOK, response.Success()) } + +func RefreshHistorySpec(ctx *context.Context) { + scope := ctx.Query("scope") + list := ctx.Query("list") + + var scopeAll = 
false + if scope == "all" { + scopeAll = true + } + var ids = make([]int64, 0) + if list != "" { + strs := strings.Split(list, "|") + for _, s := range strs { + i, err := strconv.ParseInt(s, 10, 64) + if err != nil { + ctx.JSON(http.StatusOK, response.ServerError(err.Error())) + return + } + ids = append(ids, i) + } + + } + + total, success, err := resource.RefreshHistorySpec(scopeAll, ids) + if err != nil { + log.Error("RefreshHistorySpec error. %v", err) + ctx.JSON(http.StatusOK, response.ServerError(err.Error())) + return + } + r := make(map[string]interface{}, 0) + r["success"] = success + r["total"] = total + ctx.JSON(http.StatusOK, response.SuccessWithData(r)) +} diff --git a/routers/private/internal.go b/routers/private/internal.go index 4731463b1..3e2eeab31 100755 --- a/routers/private/internal.go +++ b/routers/private/internal.go @@ -6,6 +6,7 @@ package private import ( + "code.gitea.io/gitea/routers/admin" "strings" "code.gitea.io/gitea/routers/repo" @@ -51,6 +52,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/tool/org_stat", OrgStatisticManually) m.Post("/tool/update_repo_visit/:date", UpdateRepoVisit) m.Post("/task/history_handle/duration", repo.HandleTaskWithNoDuration) + m.Post("/resources/specification/handle_historical_task", admin.RefreshHistorySpec) }, CheckInternalToken) } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 02a1f6a2a..953d24bb2 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -2,6 +2,7 @@ package repo import ( "bufio" + "code.gitea.io/gitea/services/cloudbrain/resource" "encoding/json" "errors" "fmt" @@ -123,86 +124,7 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { ctx.Data["QueuesDetail"] = queuesDetail } - cloudbrain.InitSpecialPool() - - if gpuInfos == nil { - json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) - } - ctx.Data["gpu_types"] = gpuInfos.GpuInfo - - if trainGpuInfos == nil { - json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) - } - ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo - - if inferenceGpuInfos == nil && setting.InferenceGpuTypes != "" { - json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) - } - if inferenceGpuInfos != nil { - ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo - } - - if benchmarkGpuInfos == nil { - json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) - } - ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo - - if benchmarkResourceSpecs == nil { - json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs) - } - ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec - - if cloudbrain.ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) - } - ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec - - if cloudbrain.TrainResourceSpecs == nil { - json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) - } - ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec - - if cloudbrain.InferenceResourceSpecs == nil && setting.InferenceResourceSpecs != "" { - json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) - } - if cloudbrain.InferenceResourceSpecs != nil { - ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec - } - - if cloudbrain.SpecialPools != nil { - var debugGpuTypes []*models.GpuInfo - var trainGpuTypes []*models.GpuInfo - - for _, pool := range 
cloudbrain.SpecialPools.Pools { - isOrgMember, _ := models.IsOrganizationMemberByOrgName(pool.Org, ctx.User.ID) - if isOrgMember { - for _, jobType := range pool.JobType { - if jobType == string(models.JobTypeDebug) { - debugGpuTypes = append(debugGpuTypes, pool.Pool...) - if pool.ResourceSpec != nil { - ctx.Data["resource_specs"] = pool.ResourceSpec - } - } else if jobType == string(models.JobTypeTrain) { - trainGpuTypes = append(trainGpuTypes, pool.Pool...) - if pool.ResourceSpec != nil { - ctx.Data["train_resource_specs"] = pool.ResourceSpec - } - } - } - break - } - - } - - if len(debugGpuTypes) > 0 { - ctx.Data["gpu_types"] = debugGpuTypes - } - - if len(trainGpuTypes) > 0 { - ctx.Data["train_gpu_types"] = trainGpuTypes - } - - } + prepareCloudbrainOneSpecs(ctx) ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName @@ -220,6 +142,40 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { return nil } +func prepareCloudbrainOneSpecs(ctx *context.Context) { + debugSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeDebug, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + }) + ctx.Data["debug_specs"] = debugSpecs + + trainSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + }) + ctx.Data["train_specs"] = trainSpecs + + inferenceSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + }) + ctx.Data["inference_specs"] = inferenceSpecs + + benchmarkSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeBenchmark, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + }) + ctx.Data["benchmark_specs"] = benchmarkSpecs +} + func CloudBrainNew(ctx *context.Context) { err := cloudBrainNewDataPrepare(ctx) if err != nil { @@ -237,9 +193,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { image := strings.TrimSpace(form.Image) uuids := form.Attachment jobType := form.JobType - gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - resourceSpecId := form.ResourceSpecId branchName := form.BranchName bootFile := strings.TrimSpace(form.BootFile) repo := ctx.Repo.Repository @@ -337,18 +291,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { command = commandTrain } - errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId) - - if errStr != "" { - cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr(errStr, tpl, &form) - return - } - if branchName == "" { branchName = cloudbrain.DefaultBranchName } - errStr = loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath) + errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath) if errStr != "" { cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form) @@ -357,6 +303,17 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, 
models.FindSpecsOptions{ + JobType: models.JobType(jobType), + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("Resource specification not available", tpl, &form) + return + } + req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -372,7 +329,6 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), JobType: jobType, - GpuQueue: gpuQueue, Description: form.Description, BranchName: branchName, BootFile: form.BootFile, @@ -380,8 +336,8 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { CommitID: commitID, BenchmarkTypeID: 0, BenchmarkChildTypeID: 0, - ResourceSpecId: resourceSpecId, ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), + Spec: spec, } err = cloudbrain.GenerateTask(req) @@ -428,9 +384,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra image := strings.TrimSpace(form.Image) uuid := form.Attachment jobType := string(models.JobTypeInference) - gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - resourceSpecId := form.ResourceSpecId branchName := form.BranchName bootFile := strings.TrimSpace(form.BootFile) labelName := form.LabelName @@ -522,7 +476,16 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) return } - + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("Resource specification not available", tpl, &form) + return + } req := cloudbrain.GenerateCloudBrainTaskReq{ Ctx: ctx, DisplayJobName: displayJobName, @@ -538,19 +501,18 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), JobType: jobType, - GpuQueue: gpuQueue, Description: form.Description, BranchName: branchName, BootFile: form.BootFile, Params: form.Params, CommitID: commitID, - ResourceSpecId: resourceSpecId, ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), ModelName: form.ModelName, ModelVersion: form.ModelVersion, CkptName: form.CkptName, TrainUrl: form.TrainUrl, LabelName: labelName, + Spec: spec, } err = cloudbrain.GenerateTask(req) @@ -628,34 +590,25 @@ func CloudBrainRestart(ctx *context.Context) { break } - var hasSameResource bool - if gpuInfos == nil { - json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) - } - for _, resourceType := range gpuInfos.GpuInfo { - if resourceType.Queue == task.GpuQueue { - hasSameResource = true - break - } - } - if !hasSameResource && cloudbrain.SpecialPools != nil { - - for _, specialPool := range cloudbrain.SpecialPools.Pools { - cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) - for _, pool := range specialPool.Pool { - if pool.Queue == task.GpuQueue { - 
hasSameResource = true - } - } - } + specOld, err := resource.GetCloudbrainSpec(task.ID) + if err != nil || specOld == nil { + log.Error("CloudBrainRestart GetCloudbrainSpec error.task.id = %d", task.ID) + resultCode = "-1" + errorMsg = "Resource specification not support any more" + break } - - if !hasSameResource { - log.Error("has no same resource, can not restart", ctx.Data["MsgID"]) + spec, err := resource.GetAndCheckSpec(ctx.User.ID, specOld.ID, models.FindSpecsOptions{ + JobType: models.JobType(task.JobType), + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { + log.Error("CloudBrainRestart GetAndCheckSpec error.task.id = %d", task.ID) resultCode = "-1" - errorMsg = "the job's version is too old and can not be restarted" + errorMsg = "Resource specification not support any more" break } + task.Spec = spec count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, string(models.JobTypeDebug)) if err != nil { @@ -728,128 +681,13 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo ctx.NotFound(ctx.Req.URL.RequestURI(), nil) return } - hasSpec := false - if task.JobType == string(models.JobTypeTrain) { - if cloudbrain.TrainResourceSpecs == nil { - json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) - } - - for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec { - if tmp.Id == task.ResourceSpecId { - hasSpec = true - ctx.Data["GpuNum"] = tmp.GpuNum - ctx.Data["CpuNum"] = tmp.CpuNum - ctx.Data["MemMiB"] = tmp.MemMiB - ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB - break - } - } - - } else if task.JobType == string(models.JobTypeInference) { - if cloudbrain.InferenceResourceSpecs == nil { - json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) - } - for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { - if tmp.Id == task.ResourceSpecId { - hasSpec = true - ctx.Data["GpuNum"] = tmp.GpuNum - ctx.Data["CpuNum"] = tmp.CpuNum - ctx.Data["MemMiB"] = tmp.MemMiB - ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB - break - } - } - } else { - if cloudbrain.ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) - } - for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec { - if tmp.Id == task.ResourceSpecId { - hasSpec = true - ctx.Data["GpuNum"] = tmp.GpuNum - ctx.Data["CpuNum"] = tmp.CpuNum - ctx.Data["MemMiB"] = tmp.MemMiB - ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB - break - - } - } - } - - if !hasSpec && cloudbrain.SpecialPools != nil { - - for _, specialPool := range cloudbrain.SpecialPools.Pools { - - if specialPool.ResourceSpec != nil { - - for _, spec := range specialPool.ResourceSpec { - if task.ResourceSpecId == spec.Id { - ctx.Data["GpuNum"] = spec.GpuNum - ctx.Data["CpuNum"] = spec.CpuNum - ctx.Data["MemMiB"] = spec.MemMiB - ctx.Data["ShareMemMiB"] = spec.ShareMemMiB - break - } - } - } - } + prepareSpec4Show(ctx, task) + if ctx.Written() { + return } if result != nil { jobRes, _ := models.ConvertToJobResultPayload(result.Payload) - jobRes.Resource.Memory = strings.ReplaceAll(jobRes.Resource.Memory, "Mi", "MB") - spec := "GPU数:" + strconv.Itoa(jobRes.Resource.NvidiaComGpu) + ",CPU数:" + strconv.Itoa(jobRes.Resource.CPU) + ",内存(MB):" + jobRes.Resource.Memory - ctx.Data["resource_spec"] = spec - if task.JobType == string(models.JobTypeTrain) { - if trainGpuInfos == nil { - json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) - } - 
for _, resourceType := range trainGpuInfos.GpuInfo { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - - } else if task.JobType == string(models.JobTypeInference) { - if inferenceGpuInfos == nil { - json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) - } - for _, resourceType := range inferenceGpuInfos.GpuInfo { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - } else if cloudbrain.IsBenchmarkJob(task.JobType) { - if benchmarkGpuInfos == nil { - json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) - } - - for _, resourceType := range benchmarkGpuInfos.GpuInfo { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - - } else { - if gpuInfos == nil { - json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) - } - for _, resourceType := range gpuInfos.GpuInfo { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - } - - if cloudbrain.SpecialPools != nil { - for _, specialPool := range cloudbrain.SpecialPools.Pools { - for _, resourceType := range specialPool.Pool { - if resourceType.Queue == jobRes.Config.GpuType { - ctx.Data["resource_type"] = resourceType.Value - } - } - } - } taskRoles := jobRes.TaskRoles taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) ctx.Data["taskRes"] = taskRes @@ -973,6 +811,85 @@ func CloudBrainDebug(ctx *context.Context) { ctx.Redirect(debugUrl) } +func prepareSpec4Show(ctx *context.Context, task *models.Cloudbrain) { + s, err := resource.GetCloudbrainSpec(task.ID) + if err != nil { + log.Info("error:" + err.Error()) + ctx.NotFound(ctx.Req.URL.RequestURI(), nil) + return + } + ctx.Data["Spec"] = s +} + +func oldPrepareSpec4Show(ctx *context.Context, task *models.Cloudbrain) { + hasSpec := false + if task.JobType == string(models.JobTypeTrain) { + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + + for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + ctx.Data["GpuNum"] = tmp.GpuNum + ctx.Data["CpuNum"] = tmp.CpuNum + ctx.Data["MemMiB"] = tmp.MemMiB + ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB + break + } + } + + } else if task.JobType == string(models.JobTypeInference) { + if cloudbrain.InferenceResourceSpecs == nil { + json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) + } + for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + ctx.Data["GpuNum"] = tmp.GpuNum + ctx.Data["CpuNum"] = tmp.CpuNum + ctx.Data["MemMiB"] = tmp.MemMiB + ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB + break + } + } + } else { + if cloudbrain.ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) + } + for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + ctx.Data["GpuNum"] = tmp.GpuNum + ctx.Data["CpuNum"] = tmp.CpuNum + ctx.Data["MemMiB"] = tmp.MemMiB + ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB + break + + } + } + } + + if !hasSpec && cloudbrain.SpecialPools != nil { + + for _, specialPool := range cloudbrain.SpecialPools.Pools { + + if specialPool.ResourceSpec != nil { + + for _, spec := range specialPool.ResourceSpec { + if 
task.ResourceSpecId == spec.Id { + ctx.Data["GpuNum"] = spec.GpuNum + ctx.Data["CpuNum"] = spec.CpuNum + ctx.Data["MemMiB"] = spec.MemMiB + ctx.Data["ShareMemMiB"] = spec.ShareMemMiB + break + } + } + } + } + } +} + func CloudBrainCommitImageShow(ctx *context.Context) { ctx.Data["PageIsCloudBrain"] = true ctx.Data["Type"] = ctx.Cloudbrain.Type @@ -2306,10 +2223,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) image := strings.TrimSpace(form.Image) - gpuQueue := form.GpuType command := cloudbrain.CommandBenchmark codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - resourceSpecId := cloudbrain.BenchMarkResourceID benchmarkTypeID := form.BenchmarkTypeID benchmarkChildTypeID := form.BenchmarkChildTypeID repo := ctx.Repo.Repository @@ -2359,19 +2274,14 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo return } - _, err = getBenchmarkGpuQueue(gpuQueue) - if err != nil { - log.Error("getBenchmarkGpuQueue failed:%v", err, ctx.Data["MsgID"]) + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeBenchmark, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("gpu queue error", tplCloudBrainBenchmarkNew, &form) - return - } - - _, err = getBenchmarkResourceSpec(resourceSpecId) - if err != nil { - log.Error("getBenchmarkResourceSpec failed:%v", err, ctx.Data["MsgID"]) - cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("resource spec error", tplCloudBrainBenchmarkNew, &form) + ctx.RenderWithErr("Resource specification not available", tplCloudBrainBenchmarkNew, &form) return } @@ -2432,14 +2342,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo } benchmarkPath := setting.JobPath + jobName + cloudbrain.BenchMarkMountPath - var gpuType string - for _, gpuInfo := range gpuInfos.GpuInfo { - if gpuInfo.Queue == gpuQueue { - gpuType = gpuInfo.Value - } - } - if err := downloadRateCode(repo, jobName, childInfo.Owner, childInfo.RepoName, benchmarkPath, form.BenchmarkCategory, gpuType, ctx.User.Name); err != nil { + if err := downloadRateCode(repo, jobName, childInfo.Owner, childInfo.RepoName, benchmarkPath, form.BenchmarkCategory, spec.AccCardType, ctx.User.Name); err != nil { log.Error("downloadRateCode failed, %v", err, ctx.Data["MsgID"]) //cloudBrainNewDataPrepare(ctx) //ctx.RenderWithErr("system error", tplCloudBrainBenchmarkNew, &form) @@ -2478,7 +2382,6 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), JobType: string(models.JobTypeBenchmark), - GpuQueue: gpuQueue, Description: form.Description, BranchName: cloudbrain.DefaultBranchName, BootFile: "", @@ -2486,8 +2389,8 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo CommitID: "", BenchmarkTypeID: benchmarkTypeID, BenchmarkChildTypeID: benchmarkChildTypeID, - ResourceSpecId: resourceSpecId, ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), + Spec: spec, } err = cloudbrain.GenerateTask(req) @@ -2507,9 +2410,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form 
auth.CreateCloudBrainForm) image := form.Image uuid := form.Attachment jobType := form.JobType - gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath - resourceSpecId := form.ResourceSpecId branchName := cloudbrain.DefaultBranchName repo := ctx.Repo.Repository @@ -2601,6 +2502,16 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form) return } + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeBenchmark, + ComputeResource: models.GPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne}) + if err != nil || spec == nil { + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("Resource specification not available", tpl, &form) + return + } log.Info("Command=" + command) log.Info("ModelPath=" + storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/")) req := cloudbrain.GenerateCloudBrainTaskReq{ @@ -2618,7 +2529,6 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), JobType: jobType, - GpuQueue: gpuQueue, Description: form.Description, BranchName: branchName, BootFile: form.BootFile, @@ -2626,8 +2536,8 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) CommitID: "", BenchmarkTypeID: 0, BenchmarkChildTypeID: benchmarkChildTypeID, - ResourceSpecId: resourceSpecId, ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"), + Spec: spec, } err = cloudbrain.GenerateTask(req) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 05a89cc49..939ba7e99 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1,6 +1,7 @@ package repo import ( + "code.gitea.io/gitea/services/cloudbrain/resource" "encoding/json" "errors" "fmt" @@ -108,15 +109,11 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err } } - //get valid resource specs - specs, err := grampus.GetResourceSpecs(processType) - - grampusSpecs := getFilterSpecBySpecialPool(specs, includeCenters, excludeCenters) - - if err != nil { - log.Error("GetResourceSpecs failed:", err.Error()) - } else { - ctx.Data["flavor_infos"] = grampusSpecs + //prepare available specs + if processType == grampus.ProcessorTypeNPU { + prepareGrampusTrainSpecs(ctx, models.NPU) + } else if processType == grampus.ProcessorTypeGPU { + prepareGrampusTrainSpecs(ctx, models.GPU) } //get branches @@ -142,6 +139,15 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err return nil } +func prepareGrampusTrainSpecs(ctx *context.Context, computeResource string) { + noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: computeResource, + Cluster: models.C2NetCluster, + }) + ctx.Data["Specs"] = noteBookSpecs +} + func getFilterSpecBySpecialPool(specs *models.GetGrampusResourceSpecsResult, includeCenters map[string]struct{}, excludeCenters map[string]struct{}) []models.GrampusSpec { if len(includeCenters) == 0 && len(excludeCenters) == 0 { return specs.Infos @@ -208,7 +214,6 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" dataMinioPath 
:= setting.Attachment.Minio.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid branchName := form.BranchName - flavorName := form.FlavorName image := strings.TrimSpace(form.Image) lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName)) @@ -284,6 +289,18 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } } + //check specification + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.GPU, + Cluster: models.C2NetCluster, + }) + if err != nil || spec == nil { + grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) + ctx.RenderWithErr("Resource specification not available", tplGrampusTrainJobGPUNew, &form) + return + } + //check dataset attachment, err := models.GetAttachmentByUUID(uuid) if err != nil { @@ -348,7 +365,6 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ComputeResource: models.GPUResource, ProcessType: grampus.ProcessorTypeGPU, Command: command, - ResourceSpecId: form.FlavorID, ImageUrl: image, Description: description, BootFile: bootFile, @@ -356,12 +372,12 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain CommitID: commitID, BranchName: branchName, Params: form.Params, - FlavorName: flavorName, EngineName: image, DatasetName: attachment.Name, IsLatestVersion: modelarts.IsLatestVersion, VersionCount: modelarts.VersionCountOne, WorkServerNumber: 1, + Spec: spec, } err = grampus.GenerateTrainJob(ctx, req) @@ -409,7 +425,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" branchName := form.BranchName isLatestVersion := modelarts.IsLatestVersion - flavorName := form.FlavorName versionCount := modelarts.VersionCountOne engineName := form.EngineName @@ -486,6 +501,18 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } } + //check specification + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.NPU, + Cluster: models.C2NetCluster, + }) + if err != nil || spec == nil { + grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) + ctx.RenderWithErr("Resource specification not available", tplGrampusTrainJobNPUNew, &form) + return + } + //check dataset attachment, err := models.GetAttachmentByUUID(uuid) if err != nil { @@ -540,7 +567,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ComputeResource: models.NPUResource, ProcessType: grampus.ProcessorTypeNPU, Command: command, - ResourceSpecId: form.FlavorID, ImageId: form.ImageID, DataUrl: dataObsPath, Description: description, @@ -553,11 +579,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain IsLatestVersion: isLatestVersion, BranchName: branchName, Params: form.Params, - FlavorName: flavorName, EngineName: engineName, VersionCount: versionCount, TotalVersionCount: modelarts.TotalVersionCount, DatasetName: attachment.Name, + Spec: spec, } err = grampus.GenerateTrainJob(ctx, req) @@ -734,6 +760,7 @@ func GrampusTrainJobShow(ctx *context.Context) { taskList := make([]*models.Cloudbrain, 0) taskList = append(taskList, task) + prepareSpec4Show(ctx, task) ctx.Data["version_list_task"] = taskList ctx.Data["datasetDownload"] = 
GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false) ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index f7f464324..769f86115 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -2,6 +2,8 @@ package repo import ( "archive/zip" + "code.gitea.io/gitea/modules/modelarts_cd" + "code.gitea.io/gitea/services/cloudbrain/resource" "encoding/json" "errors" "fmt" @@ -133,8 +135,8 @@ func notebookNewDataPrepare(ctx *context.Context) error { } ctx.Data["attachments"] = attachs ctx.Data["images"] = setting.StImageInfos.ImageInfo - ctx.Data["flavors"] = setting.StFlavorInfo.FlavorInfo - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeDebug)) + + prepareCloudbrainTwoDebugSpecs(ctx) ctx.Data["datasetType"] = models.TypeCloudBrainTwo @@ -144,13 +146,70 @@ func notebookNewDataPrepare(ctx *context.Context) error { return nil } +func prepareCloudbrainTwoDebugSpecs(ctx *context.Context) { + aiCenterCode := models.AICenterOfCloudBrainTwo + if setting.ModelartsCD.Enabled { + aiCenterCode = models.AICenterOfChengdu + } + noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeDebug, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: aiCenterCode, + }) + ctx.Data["Specs"] = noteBookSpecs +} + +func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) { + ctx.Data["PageIsNotebook"] = true + jobName := form.JobName + uuid := form.Attachment + description := form.Description + flavor := form.Flavor + + count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form) + return + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form) + return + } + } + _, err = models.GetCloudbrainByName(jobName) + if err == nil { + log.Error("the job name did already exist", ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form) + return + } else { + if !models.IsErrJobNotExist(err) { + log.Error("system error, %v", err, ctx.Data["MsgID"]) + cloudBrainNewDataPrepare(ctx) + ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form) + return + } + } + + err = modelarts.GenerateTask(ctx, jobName, uuid, description, flavor) + if err != nil { + ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form) + return + } + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all") +} + func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm) { ctx.Data["PageIsNotebook"] = true displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) uuid := form.Attachment description := form.Description - flavor := form.Flavor imageId := form.ImageId repo := ctx.Repo.Repository @@ -195,18 +254,24 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm return } } - - errStr := checkModelArtsSpecialPool(ctx, flavor, string(models.JobTypeDebug)) - if errStr != "" { + var aiCenterCode = models.AICenterOfCloudBrainTwo + if 
setting.ModelartsCD.Enabled { + aiCenterCode = models.AICenterOfChengdu + } + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeDebug, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: aiCenterCode}) + if err != nil || spec == nil { notebookNewDataPrepare(ctx) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsNotebookNew, &form) + ctx.RenderWithErr("Resource specification not available", tplModelArtsNotebookNew, &form) return } - if setting.ModelartsCD.Enabled { - err = modelarts_cd.GenerateNotebook(ctx, displayJobName, jobName, uuid, description, flavor, imageId) + err = modelarts_cd.GenerateNotebook(ctx, displayJobName, jobName, uuid, description, imageId, spec) } else { - err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId) + err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, imageId, spec) } if err != nil { @@ -253,21 +318,7 @@ func NotebookShow(ctx *context.Context) { if err == nil { task.User = user } - - findSpec := false - if setting.StFlavorInfo != nil { - ctx.Data["resource_spec"] = setting.StFlavorInfo.FlavorInfo[0].Desc - for _, f := range setting.StFlavorInfo.FlavorInfo { - if fmt.Sprint(f.Value) == task.FlavorCode { - ctx.Data["resource_spec"] = f.Desc - findSpec = true - break - } - } - } - - setShowSpecBySpecialPoolConfig(ctx, findSpec, task) - + prepareSpec4Show(ctx, task) if task.TrainJobDuration == "" { if task.Duration == 0 { var duration int64 @@ -375,6 +426,7 @@ func NotebookRestart(ctx *context.Context) { var resultCode = "-1" var errorMsg = "" var status = "" + var spec *models.Specification task := ctx.Cloudbrain @@ -402,6 +454,28 @@ func NotebookRestart(ctx *context.Context) { } } + oldSpec, err := resource.GetCloudbrainSpec(task.ID) + if err != nil || oldSpec == nil { + log.Error("NotebookManage GetCloudbrainSpec error.%v", err) + errorMsg = "Resource specification not available" + break + } + + aiCenterCode := models.AICenterOfCloudBrainTwo + if task.Type == models.TypeCDCenter { + aiCenterCode = models.AICenterOfChengdu + } + spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{ + JobType: models.JobType(task.JobType), + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: aiCenterCode}) + if err != nil || spec == nil { + log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID) + errorMsg = "Resource specification not support any more" + break + } + createTime := timeutil.TimeStampNow() param := models.NotebookAction{ Action: models.ActionStart, @@ -451,8 +525,7 @@ func NotebookRestart(ctx *context.Context) { Description: task.Description, CreatedUnix: createTime, UpdatedUnix: createTime, - FlavorCode: task.FlavorCode, - FlavorName: task.FlavorName, + Spec: spec, } err = models.RestartCloudbrain(task, newTask) @@ -698,14 +771,7 @@ func trainJobNewDataPrepare(ctx *context.Context) error { } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["flavor_infos"] = flavorInfos.Info - - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) + prepareCloudbrainTwoTrainSpecs(ctx) ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName @@ -725,6 +791,16 @@ func trainJobNewDataPrepare(ctx *context.Context) error { 
return nil } +func prepareCloudbrainTwoTrainSpecs(ctx *context.Context) { + noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo, + }) + ctx.Data["Specs"] = noteBookSpecs +} + func setMultiNodeIfConfigureMatch(ctx *context.Context) { modelarts.InitMultiNode() if modelarts.MultiNodeConfig != nil { @@ -819,13 +895,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["flavor_infos"] = flavorInfos.Info - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) + prepareCloudbrainTwoTrainSpecs(ctx) configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { @@ -914,14 +984,12 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error { } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + prepareCloudbrainTwoTrainSpecs(ctx) + spec, _ := resource.GetCloudbrainSpec(task.ID) + if spec != nil { + log.Info("spec_id = %d", spec.ID) + ctx.Data["spec_id"] = spec.ID } - ctx.Data["flavor_infos"] = flavorInfos.Info - - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) var Parameters modelarts.Parameters if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil { @@ -1012,13 +1080,7 @@ func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrai } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["flavor_infos"] = flavorInfos.Info - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) + prepareCloudbrainTwoTrainSpecs(ctx) var Parameters modelarts.Parameters if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil { @@ -1071,7 +1133,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) workServerNumber := form.WorkServerNumber engineID := form.EngineID bootFile := strings.TrimSpace(form.BootFile) - flavorCode := form.Flavor params := form.Params poolID := form.PoolID //isSaveParam := form.IsSaveParam @@ -1134,10 +1195,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) return } - errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) - if errStr != "" { + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo}) + if err != nil || spec == nil { trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form) return } //Determine whether the task name of the task in the project is duplicated @@ -1300,7 +1365,6 @@ func 
TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) BootFileUrl: codeObsPath + bootFile, BootFile: bootFile, TrainUrl: outputObsPath, - FlavorCode: flavorCode, WorkServerNumber: workServerNumber, EngineID: int64(engineID), LogUrl: logObsPath, @@ -1316,6 +1380,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) VersionCount: VersionCount, TotalVersionCount: modelarts.TotalVersionCount, DatasetName: datasetNames, + Spec: spec, } userCommand, userImageUrl := getUserCommand(engineID, req) req.UserCommand = userCommand @@ -1450,7 +1515,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ workServerNumber := form.WorkServerNumber engineID := form.EngineID bootFile := strings.TrimSpace(form.BootFile) - flavorCode := form.Flavor params := form.Params poolID := form.PoolID //isSaveParam := form.IsSaveParam @@ -1498,10 +1562,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ return } - errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) - if errStr != "" { + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeTrain, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo}) + if err != nil || spec == nil { versionErrorDataPrepare(ctx, form) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form) return } @@ -1655,7 +1723,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ BootFileUrl: codeObsPath + bootFile, BootFile: bootFile, TrainUrl: outputObsPath, - FlavorCode: flavorCode, WorkServerNumber: workServerNumber, IsLatestVersion: isLatestVersion, EngineID: int64(engineID), @@ -1672,6 +1739,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ PreVersionName: PreVersionName, TotalVersionCount: latestTask.TotalVersionCount + 1, DatasetName: datasetNames, + Spec: spec, } userCommand, userImageUrl := getUserCommand(engineID, req) req.UserCommand = userCommand @@ -1855,7 +1923,6 @@ func TrainJobShow(ctx *context.Context) { for i, task := range VersionListTasks { var parameters models.Parameters - err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), &parameters) if err != nil { log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err) @@ -1876,6 +1943,14 @@ func TrainJobShow(ctx *context.Context) { datasetList = append(datasetList, GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)) VersionListTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) VersionListTasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) + + //add spec + s, err := resource.GetCloudbrainSpec(task.Cloudbrain.ID) + if err != nil { + log.Error("TrainJobShow GetCloudbrainSpec error:" + err.Error()) + continue + } + VersionListTasks[i].Cloudbrain.Spec = s } pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5) @@ -2043,7 +2118,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference workServerNumber := form.WorkServerNumber engineID := form.EngineID bootFile := strings.TrimSpace(form.BootFile) - flavorCode := form.Flavor params := form.Params poolID := form.PoolID repo := ctx.Repo.Repository @@ -2130,13 +2204,16 @@ func InferenceJobCreate(ctx *context.Context,
form auth.CreateModelArtsInference } } - errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) - if errStr != "" { + spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo}) + if err != nil || spec == nil { inferenceJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + ctx.RenderWithErr("Resource specification not available", tplModelArtsInferenceJobNew, &form) return } - //todo: del the codeLocalPath _, err = ioutil.ReadDir(codeLocalPath) if err == nil { @@ -2188,7 +2265,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid) if err != nil { inferenceJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form) return } dataPath := dataUrl @@ -2244,7 +2321,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference BootFileUrl: codeObsPath + bootFile, BootFile: bootFile, TrainUrl: trainUrl, - FlavorCode: flavorCode, WorkServerNumber: workServerNumber, EngineID: int64(engineID), LogUrl: logObsPath, @@ -2264,6 +2340,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference ModelVersion: modelVersion, CkptName: ckptName, ResultUrl: resultObsPath, + Spec: spec, DatasetName: datasetNames, } @@ -2444,14 +2521,7 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error { } ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - - ctx.Data["flavor_infos"] = flavorInfos.Info - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeInference)) + prepareCloudbrainTwoInferenceSpecs(ctx) ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName @@ -2482,6 +2552,16 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error { return nil } +func prepareCloudbrainTwoInferenceSpecs(ctx *context.Context) { + noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ + JobType: models.JobTypeInference, + ComputeResource: models.NPU, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainTwo, + }) + ctx.Data["Specs"] = noteBookSpecs +} + func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error { ctx.Data["PageIsCloudBrain"] = true @@ -2516,14 +2596,7 @@ func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModel return err } ctx.Data["engine_versions"] = versionInfos.Version - - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err - } - ctx.Data["flavor_infos"] = flavorInfos.Info - setSpecBySpecialPoolConfig(ctx, string(models.JobTypeInference)) + prepareCloudbrainTwoInferenceSpecs(ctx) configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { @@ -2598,7 +2671,7 @@ func InferenceJobShow(ctx *context.Context) { } else { task.Parameters = "" 
} - + prepareSpec4Show(ctx, task) LabelName := strings.Fields(task.LabelName) ctx.Data["labelName"] = LabelName ctx.Data["jobID"] = jobID diff --git a/routers/response/response_list.go b/routers/response/response_list.go index 5e057bfd0..6514f3edd 100644 --- a/routers/response/response_list.go +++ b/routers/response/response_list.go @@ -2,3 +2,4 @@ package response var RESOURCE_QUEUE_NOT_AVAILABLE = &BizError{Code: 1001, Err: "resource queue not available"} var SPECIFICATION_NOT_EXIST = &BizError{Code: 1002, Err: "specification not exist"} +var SPECIFICATION_NOT_AVAILABLE = &BizError{Code: 1003, Err: "specification not available"} diff --git a/routers/user/home.go b/routers/user/home.go index d8c2565c6..78e6c00e9 100755 --- a/routers/user/home.go +++ b/routers/user/home.go @@ -836,14 +836,12 @@ func Cloudbrains(ctx *context.Context) { ctx.ServerError("Get job failed:", err) return } - + models.LoadSpecs4CloudbrainInfo(ciTasks) for i, task := range ciTasks { ciTasks[i].CanDebug = true ciTasks[i].CanDel = true ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource ciTasks[i].Cloudbrain.AiCenter = repo.GetCloudbrainAiCenter(task.Cloudbrain, ctx) - _, cardType, _ := repo.GetCloudbrainCardNumAndType(task.Cloudbrain) - ciTasks[i].Cloudbrain.CardType = cardType ciTasks[i].Cloudbrain.Cluster = repo.GetCloudbrainCluster(task.Cloudbrain, ctx) } diff --git a/services/cloudbrain/resource/resource_specification.go b/services/cloudbrain/resource/resource_specification.go index 680b98933..b68abbb88 100644 --- a/services/cloudbrain/resource/resource_specification.go +++ b/services/cloudbrain/resource/resource_specification.go @@ -2,12 +2,19 @@ package resource import ( "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/grampus" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/modelarts" + "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/routers/response" "code.gitea.io/gitea/services/admin/operate_log" + "encoding/json" + "errors" "fmt" + "strconv" "strings" + "time" ) func AddResourceSpecification(doerId int64, req models.ResourceSpecificationReq) error { @@ -92,6 +99,7 @@ func SyncGrampusSpecs(doerId int64) error { GPUMemGiB: gpuMemGiB, Status: models.SpecNotVerified, IsAutomaticSync: true, + IsAvailable: true, CreatedBy: doerId, UpdatedBy: doerId, }) @@ -103,6 +111,7 @@ func SyncGrampusSpecs(doerId int64) error { CpuCores: spec.SpecInfo.CpuCoreNum, MemGiB: memGiB, GPUMemGiB: gpuMemGiB, + IsAvailable: true, UpdatedBy: doerId, }) } @@ -142,7 +151,9 @@ func ResourceSpecOnShelf(doerId int64, id int64, unitPrice int) *response.BizErr if q, err := models.GetResourceQueue(&models.ResourceQueue{ID: spec.QueueId}); err != nil || q == nil { return response.RESOURCE_QUEUE_NOT_AVAILABLE } - + if !spec.IsAvailable { + return response.SPECIFICATION_NOT_AVAILABLE + } err = models.ResourceSpecOnShelf(id, unitPrice) if err != nil { return response.NewBizError(err) @@ -184,3 +195,461 @@ func AddSpecOperateLog(doerId int64, operateType string, newValue, oldValue *mod Comment: comment, }) } + +func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.Specification, error) { + r, err := models.FindSpecs(opts) + if err != nil { + log.Error("FindAvailableSpecs error.%v", err) + return nil, err + } + //filter exclusive specs + specs := filterExclusiveSpecs(r, userId) + + //distinct by sourceSpecId + specs = distinctSpecs(specs) + return specs, err +} + +func filterExclusiveSpecs(r []*models.Specification, userId int64) 
[]*models.Specification { + specs := make([]*models.Specification, 0, len(r)) + specMap := make(map[int64]string, 0) + for i := 0; i < len(r); i++ { + spec := r[i] + if _, has := specMap[spec.ID]; has { + continue + } + if !spec.IsExclusive { + specs = append(specs, spec) + specMap[spec.ID] = "" + continue + } + orgs := strings.Split(spec.ExclusiveOrg, ";") + for _, org := range orgs { + isMember, _ := models.IsOrganizationMemberByOrgName(org, userId) + if isMember { + specs = append(specs, spec) + specMap[spec.ID] = "" + break + } + } + } + return specs +} + +func distinctSpecs(r []*models.Specification) []*models.Specification { + specs := make([]*models.Specification, 0, len(r)) + sourceSpecIdMap := make(map[string]string, 0) + for i := 0; i < len(r); i++ { + spec := r[i] + if spec.SourceSpecId == "" { + specs = append(specs, spec) + continue + } + if _, has := sourceSpecIdMap[spec.SourceSpecId]; has { + continue + } + specs = append(specs, spec) + sourceSpecIdMap[spec.SourceSpecId] = "" + } + return specs +} + +func GetAndCheckSpec(userId int64, specId int64, opts models.FindSpecsOptions) (*models.Specification, error) { + if specId == 0 { + return nil, nil + } + opts.SpecId = specId + r, err := FindAvailableSpecs(userId, opts) + if err != nil { + return nil, err + } + if r == nil || len(r) == 0 { + return nil, nil + } + return r[0], nil +} + +func InsertCloudbrainSpec(cloudbrainId int64, s *models.Specification) error { + c := models.CloudbrainSpec{ + CloudbrainID: cloudbrainId, + SpecId: s.ID, + SourceSpecId: s.SourceSpecId, + AccCardsNum: s.AccCardsNum, + AccCardType: s.AccCardType, + CpuCores: s.CpuCores, + MemGiB: s.MemGiB, + GPUMemGiB: s.GPUMemGiB, + ShareMemGiB: s.ShareMemGiB, + ComputeResource: s.ComputeResource, + UnitPrice: s.UnitPrice, + QueueId: s.QueueId, + QueueCode: s.QueueCode, + Cluster: s.Cluster, + AiCenterCode: s.AiCenterCode, + AiCenterName: s.AiCenterName, + IsExclusive: s.IsExclusive, + ExclusiveOrg: s.ExclusiveOrg, + } + _, err := models.InsertCloudbrainSpec(c) + if err != nil { + log.Error("InsertCloudbrainSpec error.CloudbrainSpec=%v. err=%v", c, err) + return err + } + return nil +} + +func GetCloudbrainSpec(cloudbrainId int64) (*models.Specification, error) { + c, err := models.GetCloudbrainSpecByID(cloudbrainId) + if err != nil { + return nil, err + } + if c == nil { + return nil, nil + } + return c.ConvertToSpecification(), nil +} + +func RefreshHistorySpec(scopeAll bool, ids []int64) (int64, int64, error) { + var success int64 + var total int64 + + if !scopeAll { + if ids == nil || len(ids) == 0 { + return 0, 0, nil + } + total = int64(len(ids)) + tasks, err := models.GetCloudbrainWithDeletedByIDs(ids) + if err != nil { + return total, 0, err + } + for _, task := range tasks { + err = RefreshOneHistorySpec(task) + if err != nil { + log.Error("RefreshOneHistorySpec error.%v", err) + continue + } + success++ + } + + } else { + page := 1 + pageSize := 100 + n, err := models.CountNoSpecHistoricTask() + if err != nil { + log.Error("FindNoSpecHistoricTask CountNoSpecHistoricTask error. e=%v", err) + return 0, 0, err + } + total = n + for i := 0; i < 500; i++ { + list, err := models.FindCloudbrainTask(page, pageSize) + page++ + if err != nil { + log.Error("FindCloudbrainTask error.page=%d pageSize=%d e=%v", page, pageSize, err) + return total, success, err + } + if len(list) == 0 { + log.Info("RefreshHistorySpec. 
list is empty") + break + } + for _, task := range list { + s, err := GetCloudbrainSpec(task.ID) + if err != nil { + log.Error("RefreshHistorySpec GetCloudbrainSpec error.%v", err) + continue + } + if s != nil { + continue + } + err = RefreshOneHistorySpec(task) + if err != nil { + log.Error("RefreshOneHistorySpec error.%v", err) + continue + } + success++ + } + if len(list) < pageSize { + log.Info("RefreshHistorySpec. list < pageSize") + break + } + } + } + return total, success, nil + +} + +func RefreshOneHistorySpec(task *models.Cloudbrain) error { + var spec *models.Specification + var err error + switch task.Type { + case models.TypeCloudBrainOne: + spec, err = getCloudbrainOneSpec(task) + case models.TypeCloudBrainTwo: + spec, err = getCloudbrainTwoSpec(task) + case models.TypeC2Net: + spec, err = getGrampusSpec(task) + } + if err != nil { + log.Error("find spec error,task.ID=%d err=%v", task.ID, err) + return err + } + if spec == nil { + log.Error("find spec failed,task.ID=%d", task.ID) + return errors.New("find spec failed") + } + return InsertCloudbrainSpec(task.ID, spec) +} + +func getCloudbrainOneSpec(task *models.Cloudbrain) (*models.Specification, error) { + if task.GpuQueue == "" { + log.Info("gpu queue is empty.task.ID = %d", task.ID) + return nil, nil + } + //find from config + spec, err := findCloudbrainOneSpecFromConfig(task) + if err != nil { + log.Error("getCloudbrainOneSpec findCloudbrainOneSpecFromConfig error.%v", err) + return nil, err + } + if spec != nil { + return spec, nil + } + //find from remote + return findCloudbrainOneSpecFromRemote(task) + +} + +func findCloudbrainOneSpecFromRemote(task *models.Cloudbrain) (*models.Specification, error) { + time.Sleep(200 * time.Millisecond) + log.Info("start findCloudbrainOneSpecFromRemote") + result, err := cloudbrain.GetJob(task.JobID) + if err != nil { + log.Error("getCloudbrainOneSpec error. 
%v", err) + return nil, err + } + + if result == nil { + log.Info("findCloudbrainOneSpecFromRemote failed,result is empty.task.ID=%d", task.ID) + return nil, nil + } + jobRes, _ := models.ConvertToJobResultPayload(result.Payload) + memSize, _ := models.ParseMemSizeFromGrampus(jobRes.Resource.Memory) + if task.ComputeResource == "CPU/GPU" { + task.ComputeResource = models.GPU + } + var shmMB float32 + if jobRes.Config.TaskRoles != nil && len(jobRes.Config.TaskRoles) > 0 { + shmMB = float32(jobRes.Config.TaskRoles[0].ShmMB) / 1024 + if jobRes.Config.TaskRoles[0].ShmMB == 103600 { + shmMB = 100 + } else if jobRes.Config.TaskRoles[0].ShmMB == 51800 { + shmMB = 50 + } + } + opt := models.FindSpecsOptions{ + ComputeResource: task.ComputeResource, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + QueueCode: task.GpuQueue, + AccCardsNum: jobRes.Resource.NvidiaComGpu, + UseAccCardsNum: true, + CpuCores: jobRes.Resource.CPU, + UseCpuCores: true, + MemGiB: memSize, + UseMemGiB: memSize > 0, + ShareMemGiB: shmMB, + UseShareMemGiB: shmMB > 0, + RequestAll: true, + } + specs, err := models.FindSpecs(opt) + if err != nil { + log.Error("getCloudbrainOneSpec from remote error,%v", err) + return nil, err + } + if len(specs) == 1 { + return specs[0], nil + } + if len(specs) == 0 { + s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加") + if err != nil { + log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err) + return nil, nil + } + return s, nil + } + log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt) + return nil, nil +} + +func findCloudbrainOneSpecFromConfig(task *models.Cloudbrain) (*models.Specification, error) { + //find from config + var specConfig *models.ResourceSpec + hasSpec := false + if task.JobType == string(models.JobTypeTrain) { + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + specConfig = tmp + break + } + } + } else if task.JobType == string(models.JobTypeInference) { + if cloudbrain.InferenceResourceSpecs == nil { + json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) + } + for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + specConfig = tmp + break + } + } + } else { + if cloudbrain.ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) + } + for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec { + if tmp.Id == task.ResourceSpecId { + hasSpec = true + specConfig = tmp + break + + } + } + } + if !hasSpec && cloudbrain.SpecialPools != nil { + + for _, specialPool := range cloudbrain.SpecialPools.Pools { + + if specialPool.ResourceSpec != nil { + + for _, spec := range specialPool.ResourceSpec { + if task.ResourceSpecId == spec.Id { + hasSpec = true + specConfig = spec + break + } + } + } + } + } + if specConfig == nil { + log.Error("getCloudbrainOneSpec from config failed,task.ResourceSpecId=%d", task.ResourceSpecId) + return nil, nil + } + if task.ComputeResource == "CPU/GPU" { + task.ComputeResource = models.GPU + } + + shareMemMiB := float32(specConfig.ShareMemMiB) / 1024 + if specConfig.ShareMemMiB == 103600 { + shareMemMiB = 100 + } else if specConfig.ShareMemMiB == 51800 { + shareMemMiB = 50 + } + opt := models.FindSpecsOptions{ + JobType: 
models.JobType(task.JobType), + ComputeResource: task.ComputeResource, + Cluster: models.OpenICluster, + AiCenterCode: models.AICenterOfCloudBrainOne, + QueueCode: task.GpuQueue, + AccCardsNum: specConfig.GpuNum, + UseAccCardsNum: true, + CpuCores: specConfig.CpuNum, + UseCpuCores: true, + MemGiB: float32(specConfig.MemMiB) / 1024, + UseMemGiB: true, + ShareMemGiB: shareMemMiB, + UseShareMemGiB: true, + RequestAll: true, + } + specs, err := models.FindSpecs(opt) + if err != nil { + log.Error("getCloudbrainOneSpec from config error,%v", err) + return nil, err + } + if len(specs) > 1 { + log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt) + return nil, nil + } + if len(specs) == 0 { + s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加") + if err != nil { + log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err) + return nil, nil + } + return s, nil + } + return specs[0], nil +} + +func getCloudbrainTwoSpec(task *models.Cloudbrain) (*models.Specification, error) { + specMap, err := models.GetCloudbrainTwoSpecs() + if err != nil { + log.Error("InitCloudbrainTwoSpecs err.%v", err) + return nil, err + } + if task.FlavorCode != "" { + return specMap[task.FlavorCode], nil + } + time.Sleep(200 * time.Millisecond) + log.Info("start getCloudbrainTwoSpec FromRemote") + if task.JobType == string(models.JobTypeDebug) { + result, err := modelarts.GetNotebook2(task.JobID) + if err != nil { + log.Error("getCloudbrainTwoSpec GetNotebook2 error.%v", err) + return nil, err + } + if result != nil { + return specMap[result.Flavor], nil + } + } else if task.JobType == string(models.JobTypeTrain) || task.JobType == string(models.JobTypeInference) { + result, err := modelarts.GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("getCloudbrainTwoSpec GetTrainJob error:%v", task.JobName, err) + return nil, err + } + if result != nil { + return specMap[result.Flavor.Code], nil + } + } + return nil, nil +} + +func getGrampusSpec(task *models.Cloudbrain) (*models.Specification, error) { + specMap, err := models.GetGrampusSpecs() + if err != nil { + log.Error("GetGrampusSpecs err.%v", err) + return nil, err + } + if task.AiCenter != "" { + c := strings.Split(task.AiCenter, "+") + spec := specMap[task.FlavorCode+"_"+c[0]] + if spec != nil { + return spec, nil + } + } + return specMap[task.FlavorCode], nil +} + +func InitQueueAndSpec(opt models.FindSpecsOptions, aiCenterName string, remark string) (*models.Specification, error) { + return models.InitQueueAndSpec(models.ResourceQueue{ + QueueCode: opt.QueueCode, + Cluster: opt.Cluster, + AiCenterCode: opt.AiCenterCode, + AiCenterName: aiCenterName, + ComputeResource: opt.ComputeResource, + AccCardType: models.GetCloudbrainOneAccCardType(opt.QueueCode), + Remark: remark, + }, models.ResourceSpecification{ + AccCardsNum: opt.AccCardsNum, + CpuCores: opt.CpuCores, + MemGiB: opt.MemGiB, + GPUMemGiB: opt.GPUMemGiB, + ShareMemGiB: opt.ShareMemGiB, + Status: models.SpecOffShelf, + IsAvailable: true, + }) +} diff --git a/templates/admin/cloudbrain/list.tmpl b/templates/admin/cloudbrain/list.tmpl index 83510f268..4bac45f2b 100755 --- a/templates/admin/cloudbrain/list.tmpl +++ b/templates/admin/cloudbrain/list.tmpl @@ -1,4 +1,5 @@ {{template "base/head" .}} +
@@ -175,10 +176,17 @@
- - {{if .CardType}}{{.CardType}}{{else}}--{{end}} - -
+ +
+
{{if .User.Name}} diff --git a/templates/admin/resources/queue.tmpl b/templates/admin/resources/queue.tmpl index 3f2d83b99..13c30690a 100644 --- a/templates/admin/resources/queue.tmpl +++ b/templates/admin/resources/queue.tmpl @@ -4,7 +4,7 @@ {{template "admin/navbar" .}}
- +
{{template "base/footer" .}} diff --git a/templates/admin/resources/scene.tmpl b/templates/admin/resources/scene.tmpl index 53af0352c..f0a94a703 100644 --- a/templates/admin/resources/scene.tmpl +++ b/templates/admin/resources/scene.tmpl @@ -4,7 +4,7 @@ {{template "admin/navbar" .}}
- +
{{template "base/footer" .}} diff --git a/templates/admin/resources/specification.tmpl b/templates/admin/resources/specification.tmpl index 34992c5c9..265aafa81 100644 --- a/templates/admin/resources/specification.tmpl +++ b/templates/admin/resources/specification.tmpl @@ -4,7 +4,7 @@ {{template "admin/navbar" .}}
- +
{{template "base/footer" .}} diff --git a/templates/custom/task_wait_count.tmpl b/templates/custom/task_wait_count.tmpl new file mode 100644 index 000000000..fb8ee71fb --- /dev/null +++ b/templates/custom/task_wait_count.tmpl @@ -0,0 +1,25 @@ +
+
+ + {{.i18n.Tr "repo.wait_count_start"}} {{.WaitCount}} {{.i18n.Tr "repo.wait_count_end"}} +
+
+ diff --git a/templates/explore/images.tmpl b/templates/explore/images.tmpl index a79db4c07..90913d89c 100644 --- a/templates/explore/images.tmpl +++ b/templates/explore/images.tmpl @@ -1,24 +1,25 @@ {{template "base/head" .}} -
-
+
-
-{{template "base/footer" .}} \ No newline at end of file +{{template "base/footer" .}} diff --git a/templates/org/navber.tmpl b/templates/org/navber.tmpl index c2c3d51c0..7950d2f27 100755 --- a/templates/org/navber.tmpl +++ b/templates/org/navber.tmpl @@ -36,10 +36,7 @@ --> diff --git a/web_src/js/features/cloudbrainShow.js b/web_src/js/features/cloudbrainShow.js index 73a8ed7b6..229672f20 100644 --- a/web_src/js/features/cloudbrainShow.js +++ b/web_src/js/features/cloudbrainShow.js @@ -130,11 +130,14 @@ export default async function initCloudrainSow() { let repoPath = $(`#accordion${version_name}`).data("repopath"); $(`#log_file${version_name}`).siblings("pre").remove(); let end_line = $(`#log${version_name} input[name=end_line]`).val(); - $(".ui.inverted.active.dimmer").css("display", "block"); + $(`#log${version_name} .ui.inverted.active.dimmer`).css("display", "block"); $.get( `/api/v1/repos/${repoPath}/${ID}/log?version_name=${version_name}&base_line=&lines=50&order=desc`, (data) => { - $(".ui.inverted.active.dimmer").css("display", "none"); + $(`#log${version_name} .ui.inverted.active.dimmer`).css( + "display", + "none" + ); if (!data.CanLogDownload) { $(`#${version_name}-log-down`) .removeClass("ti-download-file") diff --git a/web_src/js/standalone/specsuse.js b/web_src/js/standalone/specsuse.js new file mode 100644 index 000000000..1b9c075f9 --- /dev/null +++ b/web_src/js/standalone/specsuse.js @@ -0,0 +1,31 @@ +window.ACC_CARD_TYPE = [{ k: 'T4', v: 'T4' }, { k: 'A100', v: 'A100' }, { k: 'V100', v: 'V100' }, { k: 'ASCEND910', v: 'Ascend 910' }, { k: 'MLU270', v: 'MLU270' }, { k: 'RTX3080', v: 'RTX3080' }]; + +window.getListValueWithKey = (list, key, k = 'k', v = 'v', defaultV = '') => { + for (let i = 0, iLen = list.length; i < iLen; i++) { + const listI = list[i]; + if (listI[k] === key) return listI[v]; + } + return defaultV; +}; + +window.renderSpecStr = (spec, showPoint, langObj) => { + showPoint = false; + var ngpu = `${spec.ComputeResource}: ${spec.AccCardsNum + '*' + getListValueWithKey(ACC_CARD_TYPE, spec.AccCardType)}`; + var gpuMemStr = spec.GPUMemGiB != 0 ? `${langObj.gpu_memory}: ${spec.GPUMemGiB}GB, ` : ''; + var sharedMemStr = spec.ShareMemGiB != 0 ? `, ${langObj.shared_memory}: ${spec.ShareMemGiB}GB` : ''; + var pointStr = showPoint ? `, ${spec.UnitPrice == 0 ? 
langObj.free : spec.UnitPrice + langObj.point_hr}` : ''; + var specStr = `${ngpu}, CPU: ${spec.CpuCores}, ${gpuMemStr}${langObj.memory}: ${spec.MemGiB}GB${sharedMemStr}${pointStr}`; + return specStr; +}; + +window.renderSpecsSelect = (specsSel, data, showPoint, langObj) => { + specsSel.empty(); + data = data || []; + var oValue = specsSel.attr('ovalue'); + for (var i = 0, iLen = data.length; i < iLen; i++) { + var spec = data[i]; + var specStr = window.renderSpecStr(spec, showPoint, langObj); + specsSel.append(``); + } + oValue && specsSel.val(oValue); +} diff --git a/web_src/vuepages/const/index.js b/web_src/vuepages/const/index.js index dde164344..fe133fb87 100644 --- a/web_src/vuepages/const/index.js +++ b/web_src/vuepages/const/index.js @@ -10,7 +10,7 @@ export const JOB_TYPE = [{ k: 'DEBUG', v: i18n.t('debugTask') }, { k: 'TRAIN', v // 资源管理 export const CLUSTERS = [{ k: 'OpenI', v: i18n.t('resourcesManagement.OpenI') }, { k: 'C2Net', v: i18n.t('resourcesManagement.C2Net') }]; -export const AI_CENTER = [{ k: 'OpenIOne', v: i18n.t('resourcesManagement.OpenIOne') }, { k: 'OpenITwo', v: i18n.t('resourcesManagement.OpenITwo') }, { k: 'chendu', v: i18n.t('resourcesManagement.chenduCenter') }, { k: 'pclcci', v: i18n.t('resourcesManagement.pclcci') }, { k: 'hefei', v: i18n.t('resourcesManagement.hefeiCenter') }, { k: 'xuchang', v: i18n.t('resourcesManagement.xuchangCenter') }]; +export const AI_CENTER = [{ k: 'OpenIOne', v: i18n.t('resourcesManagement.OpenIOne') }, { k: 'OpenITwo', v: i18n.t('resourcesManagement.OpenITwo') }, { k: 'OpenIChengdu', v: i18n.t('resourcesManagement.OpenIChengdu') }, { k: 'pclcci', v: i18n.t('resourcesManagement.pclcci') }, { k: 'hefei', v: i18n.t('resourcesManagement.hefeiCenter') }, { k: 'xuchang', v: i18n.t('resourcesManagement.xuchangCenter') }]; export const COMPUTER_RESOURCES = [{ k: 'GPU', v: 'GPU' }, { k: 'NPU', v: 'NPU' }, { k: 'MLU', v: 'MLU' }]; export const ACC_CARD_TYPE = [{ k: 'T4', v: 'T4' }, { k: 'A100', v: 'A100' }, { k: 'V100', v: 'V100' }, { k: 'ASCEND910', v: 'Ascend 910' }, { k: 'MLU270', v: 'MLU270' }, { k: 'RTX3080', v: 'RTX3080' }]; export const SPECIFICATION_STATUS = [{ k: '1', v: i18n.t('resourcesManagement.willOnShelf') }, { k: '2', v: i18n.t('resourcesManagement.onShelf') }, { k: '3', v: i18n.t('resourcesManagement.offShelf') }]; diff --git a/web_src/vuepages/langs/config/en-US.js b/web_src/vuepages/langs/config/en-US.js index c25b66b5a..9a4cee49e 100644 --- a/web_src/vuepages/langs/config/en-US.js +++ b/web_src/vuepages/langs/config/en-US.js @@ -82,7 +82,8 @@ const en = { C2Net: 'C2Net', OpenIOne: 'OpenI One', OpenITwo: 'OpenI Two', - chenduCenter: 'ChenDu AI Center', + OpenIChengdu: 'OpenI ChengDu AI Center', + chengduCenter: 'ChengDu AI Center', pclcci: 'PCL Cloud Computer Institute', hefeiCenter: 'HeFei AI Center', xuchangCenter: 'XuChang AI Center', @@ -131,6 +132,7 @@ const en = { onShelfConfirm: 'Are you sure to on shelf the resources specification?', offShelfConfirm: 'Are you sure to off shelf the resources specification?', onShelfCode1001: 'On shelf failed, the resources queues not available.', + onShelfCode1003: 'On shelf failed, the resources specification not available.', offShelfDlgTip1: 'The resources specification has already used in scene:', offShelfDlgTip2: 'Please confirm to off shelf?', resSceneManagement: 'Resources Scene Management', @@ -149,7 +151,11 @@ const en = { computeCluster: 'Compute Cluster', resourceSpecification: 'Resource Specification', lastUpdateTime: 'Last Update Time', - resSceneDeleteConfirm: 'Are
you sure to delete the current Resource Scene?', + resSceneDeleteConfirm: 'Are you sure to delete the current Resource Scene?', + resourceSpecificationIsAvailable: 'Specification Is Available', + resourceSpecificationIsAvailableAll: 'Specification Is Available(All)', + available: 'Available', + notAvailable: 'Not Available', }, } diff --git a/web_src/vuepages/langs/config/zh-CN.js b/web_src/vuepages/langs/config/zh-CN.js index c593cd684..8cfc3f101 100644 --- a/web_src/vuepages/langs/config/zh-CN.js +++ b/web_src/vuepages/langs/config/zh-CN.js @@ -82,7 +82,8 @@ const zh = { C2Net: '智算集群', OpenIOne: '云脑一', OpenITwo: '云脑二', - chenduCenter: '成都人工智能计算中心', + OpenIChengdu: '启智成都智算', + chengduCenter: '成都智算', pclcci: '鹏城云计算所', hefeiCenter: '合肥类脑类脑智能开放平台', xuchangCenter: '中原人工智能计算中心', @@ -131,6 +132,7 @@ const zh = { onShelfConfirm: '请确认上架该规格?', offShelfConfirm: '请确认下架该规格?', onShelfCode1001: '上架失败,资源池(队列)不可用。', + onShelfCode1003: '上架失败,资源规格不可用。', offShelfDlgTip1: '当前资源规格已在以下场景中使用:', offShelfDlgTip2: '请确认进行下架操作?', resSceneManagement: '算力资源应用场景管理', @@ -150,6 +152,10 @@ const zh = { resourceSpecification: '资源规格', lastUpdateTime: '最后更新时间', resSceneDeleteConfirm: '是否确认删除当前应用场景?', + resourceSpecificationIsAvailable: '资源规格是否可用', + resourceSpecificationIsAvailableAll: '资源规格是否可用(全部)', + available: '可用', + notAvailable: '不可用', }, } diff --git a/web_src/vuepages/pages/resources/components/QueueDialog.vue b/web_src/vuepages/pages/resources/components/QueueDialog.vue index 6e44e45c0..0e776f88b 100644 --- a/web_src/vuepages/pages/resources/components/QueueDialog.vue +++ b/web_src/vuepages/pages/resources/components/QueueDialog.vue @@ -10,7 +10,8 @@ {{ $t('resourcesManagement.resQueueName') }}
- + +
@@ -101,7 +102,7 @@ export default { return { dialogShow: false, clusterList: [CLUSTERS[0]], - computingCenterList: [AI_CENTER[0], AI_CENTER[1]], + computingCenterList: [AI_CENTER[0], AI_CENTER[1], AI_CENTER[2]], computingTypeList: [...COMPUTER_RESOURCES], cardTypeList: [...ACC_CARD_TYPE], diff --git a/web_src/vuepages/pages/resources/components/SceneDialog.vue b/web_src/vuepages/pages/resources/components/SceneDialog.vue index c47fa794c..3b77be6e2 100644 --- a/web_src/vuepages/pages/resources/components/SceneDialog.vue +++ b/web_src/vuepages/pages/resources/components/SceneDialog.vue @@ -88,7 +88,7 @@