You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

resource_specification.go 19 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667
  1. package resource
  2. import (
  3. "code.gitea.io/gitea/models"
  4. "code.gitea.io/gitea/modules/cloudbrain"
  5. "code.gitea.io/gitea/modules/grampus"
  6. "code.gitea.io/gitea/modules/log"
  7. "code.gitea.io/gitea/modules/modelarts"
  8. "code.gitea.io/gitea/modules/setting"
  9. "code.gitea.io/gitea/routers/response"
  10. "code.gitea.io/gitea/services/admin/operate_log"
  11. "encoding/json"
  12. "errors"
  13. "fmt"
  14. "strconv"
  15. "strings"
  16. "time"
  17. )
  18. func AddResourceSpecification(doerId int64, req models.ResourceSpecificationReq) error {
  19. if req.Status == 0 {
  20. req.Status = models.SpecNotVerified
  21. }
  22. spec := req.ToDTO()
  23. if _, err := models.InsertResourceSpecification(spec); err != nil {
  24. return err
  25. }
  26. return nil
  27. }
  28. func UpdateSpecUnitPrice(doerId int64, specId int64, unitPrice int) *response.BizError {
  29. oldSpec, err := models.GetResourceSpecification(&models.ResourceSpecification{ID: specId})
  30. if err != nil {
  31. return response.NewBizError(err)
  32. }
  33. if oldSpec == nil {
  34. return response.SPECIFICATION_NOT_EXIST
  35. }
  36. err = models.UpdateSpecUnitPriceById(specId, unitPrice)
  37. if err != nil {
  38. return response.NewBizError(err)
  39. }
  40. if oldSpec.UnitPrice != unitPrice {
  41. AddSpecOperateLog(doerId, "edit", operate_log.NewLogValues().Add("unitPrice", unitPrice), operate_log.NewLogValues().Add("unitPrice", oldSpec.UnitPrice), specId, fmt.Sprintf("修改资源规格单价从%d积分到%d积分", oldSpec.UnitPrice, unitPrice))
  42. }
  43. return nil
  44. }
  45. func SyncGrampusSpecs(doerId int64) error {
  46. r, err := grampus.GetResourceSpecs("")
  47. if err != nil {
  48. return err
  49. }
  50. log.Info("SyncGrampusSpecs result = %+v", r)
  51. specUpdateList := make([]models.ResourceSpecification, 0)
  52. specInsertList := make([]models.ResourceSpecification, 0)
  53. existIds := make([]int64, 0)
  54. for _, spec := range r.Infos {
  55. for _, c := range spec.Centers {
  56. computeResource := models.ParseComputeResourceFormGrampus(spec.SpecInfo.AccDeviceKind)
  57. if computeResource == "" {
  58. continue
  59. }
  60. accCardType := strings.ToUpper(spec.SpecInfo.AccDeviceModel)
  61. memGiB, err := models.ParseMemSizeFromGrampus(spec.SpecInfo.MemorySize)
  62. gpuMemGiB, err := models.ParseMemSizeFromGrampus(spec.SpecInfo.AccDeviceMemory)
  63. if err != nil {
  64. log.Error("ParseMemSizeFromGrampus error. MemorySize=%s AccDeviceMemory=%s", spec.SpecInfo.MemorySize, spec.SpecInfo.AccDeviceMemory)
  65. }
  66. // get resource queue.if queue not exist,skip it
  67. r, err := models.GetResourceQueue(&models.ResourceQueue{
  68. Cluster: models.C2NetCluster,
  69. AiCenterCode: c.ID,
  70. ComputeResource: computeResource,
  71. AccCardType: accCardType,
  72. })
  73. if err != nil || r == nil {
  74. continue
  75. }
  76. //Determine if this specification already exists.if exist,update params
  77. //if not exist,insert a new record and status is SpecNotVerified
  78. oldSpec, err := models.GetResourceSpecification(&models.ResourceSpecification{
  79. QueueId: r.ID,
  80. SourceSpecId: spec.ID,
  81. })
  82. if err != nil {
  83. return err
  84. }
  85. if oldSpec == nil {
  86. specInsertList = append(specInsertList, models.ResourceSpecification{
  87. QueueId: r.ID,
  88. SourceSpecId: spec.ID,
  89. AccCardsNum: spec.SpecInfo.AccDeviceNum,
  90. CpuCores: spec.SpecInfo.CpuCoreNum,
  91. MemGiB: memGiB,
  92. GPUMemGiB: gpuMemGiB,
  93. Status: models.SpecNotVerified,
  94. IsAutomaticSync: true,
  95. IsAvailable: true,
  96. CreatedBy: doerId,
  97. UpdatedBy: doerId,
  98. })
  99. } else {
  100. existIds = append(existIds, oldSpec.ID)
  101. specUpdateList = append(specUpdateList, models.ResourceSpecification{
  102. ID: oldSpec.ID,
  103. AccCardsNum: spec.SpecInfo.AccDeviceNum,
  104. CpuCores: spec.SpecInfo.CpuCoreNum,
  105. MemGiB: memGiB,
  106. GPUMemGiB: gpuMemGiB,
  107. IsAvailable: true,
  108. UpdatedBy: doerId,
  109. })
  110. }
  111. }
  112. }
  113. return models.SyncGrampusSpecs(specUpdateList, specInsertList, existIds)
  114. }
  115. //GetResourceSpecificationList returns specification and queue
  116. func GetResourceSpecificationList(opts models.SearchResourceSpecificationOptions) (*models.ResourceSpecAndQueueListRes, error) {
  117. n, r, err := models.SearchResourceSpecification(opts)
  118. if err != nil {
  119. return nil, err
  120. }
  121. return models.NewResourceSpecAndQueueListRes(n, r), nil
  122. }
  123. func GetResourceSpecificationScenes(specId int64) ([]models.ResourceSceneBriefRes, error) {
  124. r, err := models.GetSpecScenes(specId)
  125. if err != nil {
  126. return nil, err
  127. }
  128. return r, nil
  129. }
  130. func ResourceSpecOnShelf(doerId int64, id int64, unitPrice int) *response.BizError {
  131. spec, err := models.GetResourceSpecification(&models.ResourceSpecification{ID: id})
  132. if err != nil {
  133. return response.NewBizError(err)
  134. }
  135. if spec == nil {
  136. return response.SPECIFICATION_NOT_EXIST
  137. }
  138. if q, err := models.GetResourceQueue(&models.ResourceQueue{ID: spec.QueueId}); err != nil || q == nil {
  139. return response.RESOURCE_QUEUE_NOT_AVAILABLE
  140. }
  141. if !spec.IsAvailable {
  142. return response.SPECIFICATION_NOT_AVAILABLE
  143. }
  144. err = models.ResourceSpecOnShelf(id, unitPrice)
  145. if err != nil {
  146. return response.NewBizError(err)
  147. }
  148. if spec.UnitPrice != unitPrice {
  149. AddSpecOperateLog(doerId, "on-shelf", operate_log.NewLogValues().Add("UnitPrice", unitPrice), operate_log.NewLogValues().Add("UnitPrice", spec.UnitPrice), id, fmt.Sprintf("定价上架资源规格,单价为%d", unitPrice))
  150. } else {
  151. AddSpecOperateLog(doerId, "on-shelf", nil, nil, id, "上架资源规格")
  152. }
  153. return nil
  154. }
  155. func ResourceSpecOffShelf(doerId int64, id int64) *response.BizError {
  156. _, err := models.ResourceSpecOffShelf(id)
  157. if err != nil {
  158. return response.NewBizError(err)
  159. }
  160. AddSpecOperateLog(doerId, "off-shelf", nil, nil, id, "下架资源规格")
  161. return nil
  162. }
  163. func AddSpecOperateLog(doerId int64, operateType string, newValue, oldValue *models.LogValues, specId int64, comment string) {
  164. var newString = ""
  165. var oldString = ""
  166. if newValue != nil {
  167. newString = newValue.JsonString()
  168. }
  169. if oldValue != nil {
  170. oldString = oldValue.JsonString()
  171. }
  172. operate_log.Log(models.AdminOperateLog{
  173. BizType: "SpecOperate",
  174. OperateType: operateType,
  175. OldValue: oldString,
  176. NewValue: newString,
  177. RelatedId: fmt.Sprint(specId),
  178. CreatedBy: doerId,
  179. Comment: comment,
  180. })
  181. }
  182. func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.Specification, error) {
  183. r, err := models.FindSpecs(opts)
  184. if err != nil {
  185. log.Error("FindAvailableSpecs error.%v", err)
  186. return nil, err
  187. }
  188. //filter exclusive specs
  189. specs := filterExclusiveSpecs(r, userId)
  190. //distinct by sourceSpecId
  191. specs = distinctSpecs(specs)
  192. return specs, err
  193. }
  194. func FindAvailableSpecs4Show(userId int64, opts models.FindSpecsOptions) ([]*models.SpecificationShow, error) {
  195. specs, err := FindAvailableSpecs(userId, opts)
  196. if err != nil {
  197. return nil, err
  198. }
  199. result := make([]*models.SpecificationShow, len(specs))
  200. for i, v := range specs {
  201. result[i] = v.ToShow()
  202. }
  203. return result, nil
  204. }
  205. func filterExclusiveSpecs(r []*models.Specification, userId int64) []*models.Specification {
  206. specs := make([]*models.Specification, 0, len(r))
  207. specMap := make(map[int64]string, 0)
  208. for i := 0; i < len(r); i++ {
  209. spec := r[i]
  210. if _, has := specMap[spec.ID]; has {
  211. continue
  212. }
  213. if !spec.IsExclusive {
  214. specs = append(specs, spec)
  215. specMap[spec.ID] = ""
  216. continue
  217. }
  218. orgs := strings.Split(spec.ExclusiveOrg, ";")
  219. for _, org := range orgs {
  220. isMember, _ := models.IsOrganizationMemberByOrgName(org, userId)
  221. if isMember {
  222. specs = append(specs, spec)
  223. specMap[spec.ID] = ""
  224. break
  225. }
  226. }
  227. }
  228. return specs
  229. }
  230. func distinctSpecs(r []*models.Specification) []*models.Specification {
  231. specs := make([]*models.Specification, 0, len(r))
  232. sourceSpecIdMap := make(map[string]string, 0)
  233. for i := 0; i < len(r); i++ {
  234. spec := r[i]
  235. if spec.SourceSpecId == "" {
  236. specs = append(specs, spec)
  237. continue
  238. }
  239. if _, has := sourceSpecIdMap[spec.SourceSpecId]; has {
  240. continue
  241. }
  242. specs = append(specs, spec)
  243. sourceSpecIdMap[spec.SourceSpecId] = ""
  244. }
  245. return specs
  246. }
  247. func GetAndCheckSpec(userId int64, specId int64, opts models.FindSpecsOptions) (*models.Specification, error) {
  248. if specId == 0 {
  249. return nil, nil
  250. }
  251. opts.SpecId = specId
  252. r, err := FindAvailableSpecs(userId, opts)
  253. if err != nil {
  254. return nil, err
  255. }
  256. if r == nil || len(r) == 0 {
  257. return nil, nil
  258. }
  259. return r[0], nil
  260. }
  261. func InsertCloudbrainSpec(cloudbrainId int64, s *models.Specification) error {
  262. c := models.CloudbrainSpec{
  263. CloudbrainID: cloudbrainId,
  264. SpecId: s.ID,
  265. SourceSpecId: s.SourceSpecId,
  266. AccCardsNum: s.AccCardsNum,
  267. AccCardType: s.AccCardType,
  268. CpuCores: s.CpuCores,
  269. MemGiB: s.MemGiB,
  270. GPUMemGiB: s.GPUMemGiB,
  271. ShareMemGiB: s.ShareMemGiB,
  272. ComputeResource: s.ComputeResource,
  273. UnitPrice: s.UnitPrice,
  274. QueueId: s.QueueId,
  275. QueueCode: s.QueueCode,
  276. Cluster: s.Cluster,
  277. AiCenterCode: s.AiCenterCode,
  278. AiCenterName: s.AiCenterName,
  279. IsExclusive: s.IsExclusive,
  280. ExclusiveOrg: s.ExclusiveOrg,
  281. }
  282. _, err := models.InsertCloudbrainSpec(c)
  283. if err != nil {
  284. log.Error("InsertCloudbrainSpec error.CloudbrainSpec=%v. err=%v", c, err)
  285. return err
  286. }
  287. return nil
  288. }
  289. func GetCloudbrainSpec(cloudbrainId int64) (*models.Specification, error) {
  290. c, err := models.GetCloudbrainSpecByID(cloudbrainId)
  291. if err != nil {
  292. return nil, err
  293. }
  294. if c == nil {
  295. return nil, nil
  296. }
  297. return c.ConvertToSpecification(), nil
  298. }
  299. func RefreshHistorySpec(scopeAll bool, ids []int64) (int64, int64, error) {
  300. var success int64
  301. var total int64
  302. if !scopeAll {
  303. if ids == nil || len(ids) == 0 {
  304. return 0, 0, nil
  305. }
  306. total = int64(len(ids))
  307. tasks, err := models.GetCloudbrainWithDeletedByIDs(ids)
  308. if err != nil {
  309. return total, 0, err
  310. }
  311. for _, task := range tasks {
  312. err = RefreshOneHistorySpec(task)
  313. if err != nil {
  314. log.Error("RefreshOneHistorySpec error.%v", err)
  315. continue
  316. }
  317. success++
  318. }
  319. } else {
  320. page := 1
  321. pageSize := 100
  322. n, err := models.CountNoSpecHistoricTask()
  323. if err != nil {
  324. log.Error("FindNoSpecHistoricTask CountNoSpecHistoricTask error. e=%v", err)
  325. return 0, 0, err
  326. }
  327. total = n
  328. for i := 0; i < 500; i++ {
  329. list, err := models.FindCloudbrainTask(page, pageSize)
  330. page++
  331. if err != nil {
  332. log.Error("FindCloudbrainTask error.page=%d pageSize=%d e=%v", page, pageSize, err)
  333. return total, success, err
  334. }
  335. if len(list) == 0 {
  336. log.Info("RefreshHistorySpec. list is empty")
  337. break
  338. }
  339. for _, task := range list {
  340. s, err := GetCloudbrainSpec(task.ID)
  341. if err != nil {
  342. log.Error("RefreshHistorySpec GetCloudbrainSpec error.%v", err)
  343. continue
  344. }
  345. if s != nil {
  346. continue
  347. }
  348. err = RefreshOneHistorySpec(task)
  349. if err != nil {
  350. log.Error("RefreshOneHistorySpec error.%v", err)
  351. continue
  352. }
  353. success++
  354. }
  355. if len(list) < pageSize {
  356. log.Info("RefreshHistorySpec. list < pageSize")
  357. break
  358. }
  359. }
  360. }
  361. return total, success, nil
  362. }
  363. func RefreshOneHistorySpec(task *models.Cloudbrain) error {
  364. var spec *models.Specification
  365. var err error
  366. switch task.Type {
  367. case models.TypeCloudBrainOne:
  368. spec, err = getCloudbrainOneSpec(task)
  369. case models.TypeCloudBrainTwo:
  370. spec, err = getCloudbrainTwoSpec(task)
  371. case models.TypeC2Net:
  372. spec, err = getGrampusSpec(task)
  373. }
  374. if err != nil {
  375. log.Error("find spec error,task.ID=%d err=%v", task.ID, err)
  376. return err
  377. }
  378. if spec == nil {
  379. log.Error("find spec failed,task.ID=%d", task.ID)
  380. return errors.New("find spec failed")
  381. }
  382. return InsertCloudbrainSpec(task.ID, spec)
  383. }
  384. func getCloudbrainOneSpec(task *models.Cloudbrain) (*models.Specification, error) {
  385. if task.GpuQueue == "" {
  386. log.Info("gpu queue is empty.task.ID = %d", task.ID)
  387. return nil, nil
  388. }
  389. //find from config
  390. spec, err := findCloudbrainOneSpecFromConfig(task)
  391. if err != nil {
  392. log.Error("getCloudbrainOneSpec findCloudbrainOneSpecFromConfig error.%v", err)
  393. return nil, err
  394. }
  395. if spec != nil {
  396. return spec, nil
  397. }
  398. //find from remote
  399. return findCloudbrainOneSpecFromRemote(task)
  400. }
  401. func findCloudbrainOneSpecFromRemote(task *models.Cloudbrain) (*models.Specification, error) {
  402. time.Sleep(200 * time.Millisecond)
  403. log.Info("start findCloudbrainOneSpecFromRemote")
  404. result, err := cloudbrain.GetJob(task.JobID)
  405. if err != nil {
  406. log.Error("getCloudbrainOneSpec error. %v", err)
  407. return nil, err
  408. }
  409. if result == nil {
  410. log.Info("findCloudbrainOneSpecFromRemote failed,result is empty.task.ID=%d", task.ID)
  411. return nil, nil
  412. }
  413. jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
  414. memSize, _ := models.ParseMemSizeFromGrampus(jobRes.Resource.Memory)
  415. if task.ComputeResource == "CPU/GPU" {
  416. task.ComputeResource = models.GPU
  417. }
  418. var shmMB float32
  419. if jobRes.Config.TaskRoles != nil && len(jobRes.Config.TaskRoles) > 0 {
  420. shmMB = float32(jobRes.Config.TaskRoles[0].ShmMB) / 1024
  421. if jobRes.Config.TaskRoles[0].ShmMB == 103600 {
  422. shmMB = 100
  423. } else if jobRes.Config.TaskRoles[0].ShmMB == 51800 {
  424. shmMB = 50
  425. }
  426. }
  427. opt := models.FindSpecsOptions{
  428. ComputeResource: task.ComputeResource,
  429. Cluster: models.OpenICluster,
  430. AiCenterCode: models.AICenterOfCloudBrainOne,
  431. QueueCode: task.GpuQueue,
  432. AccCardsNum: jobRes.Resource.NvidiaComGpu,
  433. UseAccCardsNum: true,
  434. CpuCores: jobRes.Resource.CPU,
  435. UseCpuCores: true,
  436. MemGiB: memSize,
  437. UseMemGiB: memSize > 0,
  438. ShareMemGiB: shmMB,
  439. UseShareMemGiB: shmMB > 0,
  440. RequestAll: true,
  441. }
  442. specs, err := models.FindSpecs(opt)
  443. if err != nil {
  444. log.Error("getCloudbrainOneSpec from remote error,%v", err)
  445. return nil, err
  446. }
  447. if len(specs) == 1 {
  448. return specs[0], nil
  449. }
  450. if len(specs) == 0 {
  451. s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加")
  452. if err != nil {
  453. log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err)
  454. return nil, nil
  455. }
  456. return s, nil
  457. }
  458. log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt)
  459. return nil, nil
  460. }
  461. func findCloudbrainOneSpecFromConfig(task *models.Cloudbrain) (*models.Specification, error) {
  462. //find from config
  463. var specConfig *models.ResourceSpec
  464. hasSpec := false
  465. if task.JobType == string(models.JobTypeTrain) {
  466. if cloudbrain.TrainResourceSpecs == nil {
  467. json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
  468. }
  469. for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec {
  470. if tmp.Id == task.ResourceSpecId {
  471. hasSpec = true
  472. specConfig = tmp
  473. break
  474. }
  475. }
  476. } else if task.JobType == string(models.JobTypeInference) {
  477. if cloudbrain.InferenceResourceSpecs == nil {
  478. json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
  479. }
  480. for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec {
  481. if tmp.Id == task.ResourceSpecId {
  482. hasSpec = true
  483. specConfig = tmp
  484. break
  485. }
  486. }
  487. } else {
  488. if cloudbrain.ResourceSpecs == nil {
  489. json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
  490. }
  491. for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec {
  492. if tmp.Id == task.ResourceSpecId {
  493. hasSpec = true
  494. specConfig = tmp
  495. break
  496. }
  497. }
  498. }
  499. if !hasSpec && cloudbrain.SpecialPools != nil {
  500. for _, specialPool := range cloudbrain.SpecialPools.Pools {
  501. if specialPool.ResourceSpec != nil {
  502. for _, spec := range specialPool.ResourceSpec {
  503. if task.ResourceSpecId == spec.Id {
  504. hasSpec = true
  505. specConfig = spec
  506. break
  507. }
  508. }
  509. }
  510. }
  511. }
  512. if specConfig == nil {
  513. log.Error("getCloudbrainOneSpec from config failed,task.ResourceSpecId=%d", task.ResourceSpecId)
  514. return nil, nil
  515. }
  516. if task.ComputeResource == "CPU/GPU" {
  517. task.ComputeResource = models.GPU
  518. }
  519. shareMemMiB := float32(specConfig.ShareMemMiB) / 1024
  520. if specConfig.ShareMemMiB == 103600 {
  521. shareMemMiB = 100
  522. } else if specConfig.ShareMemMiB == 51800 {
  523. shareMemMiB = 50
  524. }
  525. opt := models.FindSpecsOptions{
  526. JobType: models.JobType(task.JobType),
  527. ComputeResource: task.ComputeResource,
  528. Cluster: models.OpenICluster,
  529. AiCenterCode: models.AICenterOfCloudBrainOne,
  530. QueueCode: task.GpuQueue,
  531. AccCardsNum: specConfig.GpuNum,
  532. UseAccCardsNum: true,
  533. CpuCores: specConfig.CpuNum,
  534. UseCpuCores: true,
  535. MemGiB: float32(specConfig.MemMiB) / 1024,
  536. UseMemGiB: true,
  537. ShareMemGiB: shareMemMiB,
  538. UseShareMemGiB: true,
  539. RequestAll: true,
  540. }
  541. specs, err := models.FindSpecs(opt)
  542. if err != nil {
  543. log.Error("getCloudbrainOneSpec from config error,%v", err)
  544. return nil, err
  545. }
  546. if len(specs) > 1 {
  547. log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt)
  548. return nil, nil
  549. }
  550. if len(specs) == 0 {
  551. s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加")
  552. if err != nil {
  553. log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err)
  554. return nil, nil
  555. }
  556. return s, nil
  557. }
  558. return specs[0], nil
  559. }
  560. func getCloudbrainTwoSpec(task *models.Cloudbrain) (*models.Specification, error) {
  561. specMap, err := models.GetCloudbrainTwoSpecs()
  562. if err != nil {
  563. log.Error("InitCloudbrainTwoSpecs err.%v", err)
  564. return nil, err
  565. }
  566. if task.FlavorCode != "" {
  567. return specMap[task.FlavorCode], nil
  568. }
  569. time.Sleep(200 * time.Millisecond)
  570. log.Info("start getCloudbrainTwoSpec FromRemote")
  571. if task.JobType == string(models.JobTypeDebug) {
  572. result, err := modelarts.GetNotebook2(task.JobID)
  573. if err != nil {
  574. log.Error("getCloudbrainTwoSpec GetNotebook2 error.%v", err)
  575. return nil, err
  576. }
  577. if result != nil {
  578. return specMap[result.Flavor], nil
  579. }
  580. } else if task.JobType == string(models.JobTypeTrain) || task.JobType == string(models.JobTypeInference) {
  581. result, err := modelarts.GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  582. if err != nil {
  583. log.Error("getCloudbrainTwoSpec GetTrainJob error:%v", task.JobName, err)
  584. return nil, err
  585. }
  586. if result != nil {
  587. return specMap[result.Flavor.Code], nil
  588. }
  589. }
  590. return nil, nil
  591. }
  592. func getGrampusSpec(task *models.Cloudbrain) (*models.Specification, error) {
  593. specMap, err := models.GetGrampusSpecs()
  594. if err != nil {
  595. log.Error("GetGrampusSpecs err.%v", err)
  596. return nil, err
  597. }
  598. if task.AiCenter != "" {
  599. c := strings.Split(task.AiCenter, "+")
  600. spec := specMap[task.FlavorCode+"_"+c[0]]
  601. if spec != nil {
  602. return spec, nil
  603. }
  604. }
  605. return specMap[task.FlavorCode], nil
  606. }
  607. func InitQueueAndSpec(opt models.FindSpecsOptions, aiCenterName string, remark string) (*models.Specification, error) {
  608. return models.InitQueueAndSpec(models.ResourceQueue{
  609. QueueCode: opt.QueueCode,
  610. Cluster: opt.Cluster,
  611. AiCenterCode: opt.AiCenterCode,
  612. AiCenterName: aiCenterName,
  613. ComputeResource: opt.ComputeResource,
  614. AccCardType: models.GetCloudbrainOneAccCardType(opt.QueueCode),
  615. Remark: remark,
  616. }, models.ResourceSpecification{
  617. AccCardsNum: opt.AccCardsNum,
  618. CpuCores: opt.CpuCores,
  619. MemGiB: opt.MemGiB,
  620. GPUMemGiB: opt.GPUMemGiB,
  621. ShareMemGiB: opt.ShareMemGiB,
  622. Status: models.SpecOffShelf,
  623. IsAvailable: true,
  624. })
  625. }