You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 35 kB

4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "strconv"
  7. "strings"
  8. "code.gitea.io/gitea/modules/modelarts_cd"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. "code.gitea.io/gitea/modules/timeutil"
  15. )
// Package-level constants shared by the ModelArts job helpers in this file.
const (
	// notebook
	// autoStopDurationMs is the Duration passed when a notebook is created:
	// 4 hours expressed in milliseconds.
	autoStopDurationMs = 4 * 60 * 60 * 1000
	// train-job
	// The commented-out values below are kept as samples of the JSON shapes
	// decoded into ResourcePool, Engine, VersionInfo and Flavor.
	// ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"dedicated resource pool\"}]}"
	// Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
	// EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
	// "]}"
	// TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 cores 512GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 cores 2048GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 cores 1024GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 cores 256GiB\"}" +
	// "]}"
	CodePath          = "/code/"
	OutputPath        = "/output/"
	ResultPath        = "/result/"
	LogPath           = "/log/"
	JobPath           = "/job/"
	OrderDesc         = "desc" // query downward
	OrderAsc          = "asc"  // query upward
	Lines             = 500
	TrainUrl          = "train_url"
	DataUrl           = "data_url"
	MultiDataUrl      = "multi_data_url"
	ResultUrl         = "result_url"
	CkptUrl           = "ckpt_url"
	DeviceTarget      = "device_target"
	Ascend            = "Ascend"
	PerPage           = 10
	IsLatestVersion   = "1"
	NotLatestVersion  = "0"
	VersionCountOne   = 1
	SortByCreateTime  = "create_time"
	ConfigTypeCustom  = "custom"
	TotalVersionCount = 1
)
// Package-level configuration caches.
var (
	// poolInfos holds the pool list decoded from setting.PoolInfos;
	// GenerateNotebook2 fills it on first use.
	poolInfos        *models.PoolInfos
	TrainFlavorInfos *Flavor
	// SpecialPools is decoded from setting.ModelArtsSpecialPools by InitSpecialPool.
	SpecialPools *models.SpecialPools
	// MultiNodeConfig is decoded from setting.ModelArtsMultiNode by InitMultiNode.
	MultiNodeConfig *MultiNodes
)
// GenerateTrainJobReq carries the inputs consumed by GenerateTrainJob,
// GenerateTrainJobVersion and GenerateModelConvertTrainJob: the values sent to
// the ModelArts create call and the values stored on the Cloudbrain record.
type GenerateTrainJobReq struct {
	JobName        string
	DisplayJobName string
	Uuid           string
	Description    string
	// CodeObsPath is sent as the job config's AppUrl.
	CodeObsPath      string
	BootFile         string
	BootFileUrl      string
	DataUrl          string
	TrainUrl         string
	LogUrl           string
	PoolID           string
	WorkServerNumber int
	// EngineID selects the creation path: a negative value uses the
	// user-image variant of the create call.
	EngineID int64
	// Parameters is sent to the create call; Params is the string stored on
	// the Cloudbrain record's Parameters field.
	Parameters      []models.Parameter
	CommitID        string
	IsLatestVersion string
	Params          string
	BranchName      string
	// PreVersionId and PreVersionName are used when creating a new version
	// of an existing job (GenerateTrainJobVersion).
	PreVersionId   int64
	PreVersionName string
	// FlavorCode is the flavor sent by GenerateModelConvertTrainJob; the other
	// creators in this file send Spec.SourceSpecId instead.
	FlavorCode        string
	FlavorName        string
	VersionCount      int
	EngineName        string
	TotalVersionCount int
	UserImageUrl      string
	UserCommand       string
	DatasetName       string
	Spec              *models.Specification
	ModelName         string
	LabelName         string
	CkptName          string
	ModelVersion      string
	PreTrainModelUrl  string
}
// GenerateInferenceJobReq carries the inputs consumed by GenerateInferenceJob:
// the values sent to the ModelArts create call and the values stored on the
// Cloudbrain record.
type GenerateInferenceJobReq struct {
	JobName        string
	DisplayJobName string
	Uuid           string
	Description    string
	// CodeObsPath is sent as the job config's AppUrl.
	CodeObsPath string
	BootFile    string
	BootFileUrl string
	DataUrl     string
	// TrainUrl is stored on the Cloudbrain record only; it is not sent to the
	// create call.
	TrainUrl         string
	LogUrl           string
	PoolID           string
	WorkServerNumber int
	// EngineID selects the creation path: a negative value uses the
	// user-image variant of the create call.
	EngineID          int64
	Parameters        []models.Parameter
	CommitID          string
	Params            string
	BranchName        string
	FlavorName        string
	EngineName        string
	LabelName         string
	IsLatestVersion   string
	VersionCount      int
	TotalVersionCount int
	ModelName         string
	ModelVersion      string
	CkptName          string
	ResultUrl         string
	Spec              *models.Specification
	DatasetName       string
	// JobType is stored on the record and decides which notification
	// GenerateInferenceJob sends.
	JobType      string
	UserImageUrl string
	UserCommand  string
}
// VersionInfo maps a JSON object whose "version" array holds entries with
// "id", "value" and "url" keys.
type VersionInfo struct {
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
		Url   string `json:"url"`
	} `json:"version"`
}
// Flavor maps a JSON object whose "flavor" array holds entries with "code",
// "value" and "unitPrice" keys.
type Flavor struct {
	Info []struct {
		Code      string `json:"code"`
		Value     string `json:"value"`
		UnitPrice int64  `json:"unitPrice"`
	} `json:"flavor"`
}
// Engine maps a JSON object whose "engine" array holds entries with "id" and
// "value" keys.
type Engine struct {
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
// ResourcePool maps a JSON object whose "resource_pool" array holds entries
// with "id" and "value" keys.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
// MultiNodes maps a JSON object whose "multinode" array holds OrgMultiNode
// entries; it is the type decoded by InitMultiNode.
type MultiNodes struct {
	Info []OrgMultiNode `json:"multinode"`
}
// OrgMultiNode is one "multinode" entry: an "org" string paired with a "node"
// list of integers.
// NOTE(review): presumably the node counts allowed for that organization — confirm against callers.
type OrgMultiNode struct {
	Org  string `json:"org"`
	Node []int  `json:"node"`
}
// Parameters maps a JSON object whose "parameter" array holds entries with
// "label" and "value" keys.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}
  170. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
  171. if poolInfos == nil {
  172. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  173. }
  174. imageName, err := GetNotebookImageName(imageId)
  175. if err != nil {
  176. log.Error("GetNotebookImageName failed: %v", err.Error())
  177. return err
  178. }
  179. createTime := timeutil.TimeStampNow()
  180. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  181. JobName: jobName,
  182. Description: description,
  183. Flavor: spec.SourceSpecId,
  184. Duration: autoStopDurationMs,
  185. ImageID: imageId,
  186. PoolID: poolInfos.PoolInfo[0].PoolId,
  187. Feature: models.NotebookFeature,
  188. Volume: models.VolumeReq{
  189. Capacity: setting.Capacity,
  190. Category: models.EVSCategory,
  191. Ownership: models.ManagedOwnership,
  192. },
  193. WorkspaceID: "0",
  194. })
  195. if err != nil {
  196. log.Error("createNotebook2 failed: %v", err.Error())
  197. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  198. log.Info("(%s)unknown error, set temp status", displayJobName)
  199. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  200. JobID: models.TempJobId,
  201. VersionID: models.TempVersionId,
  202. Status: models.TempJobStatus,
  203. Type: models.TypeCloudBrainTwo,
  204. JobName: jobName,
  205. JobType: string(models.JobTypeDebug),
  206. })
  207. if errTemp != nil {
  208. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  209. return errTemp
  210. }
  211. }
  212. return err
  213. }
  214. task := &models.Cloudbrain{
  215. Status: jobResult.Status,
  216. UserID: ctx.User.ID,
  217. RepoID: ctx.Repo.Repository.ID,
  218. JobID: jobResult.ID,
  219. JobName: jobName,
  220. FlavorCode: spec.SourceSpecId,
  221. DisplayJobName: displayJobName,
  222. JobType: string(models.JobTypeDebug),
  223. Type: models.TypeCloudBrainTwo,
  224. Uuid: uuid,
  225. ComputeResource: models.NPUResource,
  226. Image: imageName,
  227. Description: description,
  228. CreatedUnix: createTime,
  229. UpdatedUnix: createTime,
  230. Spec: spec,
  231. }
  232. err = models.CreateCloudbrain(task)
  233. if err != nil {
  234. return err
  235. }
  236. stringId := strconv.FormatInt(task.ID, 10)
  237. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  238. return nil
  239. }
  240. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  241. createTime := timeutil.TimeStampNow()
  242. var jobResult *models.CreateTrainJobResult
  243. var createErr error
  244. if req.EngineID < 0 {
  245. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  246. JobName: req.JobName,
  247. Description: req.Description,
  248. Config: models.UserImageConfig{
  249. WorkServerNum: req.WorkServerNumber,
  250. AppUrl: req.CodeObsPath,
  251. BootFileUrl: req.BootFileUrl,
  252. DataUrl: req.DataUrl,
  253. TrainUrl: req.TrainUrl,
  254. LogUrl: req.LogUrl,
  255. PoolID: req.PoolID,
  256. CreateVersion: true,
  257. Flavor: models.Flavor{
  258. Code: req.Spec.SourceSpecId,
  259. },
  260. Parameter: req.Parameters,
  261. UserImageUrl: req.UserImageUrl,
  262. UserCommand: req.UserCommand,
  263. },
  264. })
  265. } else {
  266. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  267. JobName: req.JobName,
  268. Description: req.Description,
  269. Config: models.Config{
  270. WorkServerNum: req.WorkServerNumber,
  271. AppUrl: req.CodeObsPath,
  272. BootFileUrl: req.BootFileUrl,
  273. DataUrl: req.DataUrl,
  274. EngineID: req.EngineID,
  275. TrainUrl: req.TrainUrl,
  276. LogUrl: req.LogUrl,
  277. PoolID: req.PoolID,
  278. CreateVersion: true,
  279. Flavor: models.Flavor{
  280. Code: req.Spec.SourceSpecId,
  281. },
  282. Parameter: req.Parameters,
  283. },
  284. })
  285. }
  286. if createErr != nil {
  287. log.Error("createTrainJob failed: %v", createErr.Error())
  288. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  289. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  290. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  291. JobID: models.TempJobId,
  292. VersionID: models.TempVersionId,
  293. Status: models.TempJobStatus,
  294. Type: models.TypeCloudBrainTwo,
  295. JobName: req.JobName,
  296. JobType: string(models.JobTypeTrain),
  297. })
  298. if errTemp != nil {
  299. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  300. return "", errTemp
  301. }
  302. }
  303. return "", createErr
  304. }
  305. jobID := strconv.FormatInt(jobResult.JobID, 10)
  306. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  307. Status: TransTrainJobStatus(jobResult.Status),
  308. UserID: ctx.User.ID,
  309. RepoID: ctx.Repo.Repository.ID,
  310. JobID: jobID,
  311. JobName: req.JobName,
  312. DisplayJobName: req.DisplayJobName,
  313. JobType: string(models.JobTypeTrain),
  314. Type: models.TypeCloudBrainTwo,
  315. VersionID: jobResult.VersionID,
  316. VersionName: jobResult.VersionName,
  317. Uuid: req.Uuid,
  318. DatasetName: req.DatasetName,
  319. CommitID: req.CommitID,
  320. IsLatestVersion: req.IsLatestVersion,
  321. ComputeResource: models.NPUResource,
  322. EngineID: req.EngineID,
  323. TrainUrl: req.TrainUrl,
  324. BranchName: req.BranchName,
  325. Parameters: req.Params,
  326. BootFile: req.BootFile,
  327. DataUrl: req.DataUrl,
  328. LogUrl: req.LogUrl,
  329. FlavorCode: req.Spec.SourceSpecId,
  330. Description: req.Description,
  331. WorkServerNumber: req.WorkServerNumber,
  332. FlavorName: req.FlavorName,
  333. EngineName: req.EngineName,
  334. VersionCount: req.VersionCount,
  335. TotalVersionCount: req.TotalVersionCount,
  336. CreatedUnix: createTime,
  337. UpdatedUnix: createTime,
  338. Spec: req.Spec,
  339. ModelName: req.ModelName,
  340. ModelVersion: req.ModelVersion,
  341. LabelName: req.LabelName,
  342. PreTrainModelUrl: req.PreTrainModelUrl,
  343. CkptName: req.CkptName,
  344. })
  345. if createErr != nil {
  346. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  347. return "", createErr
  348. }
  349. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
  350. return jobID, nil
  351. }
  352. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  353. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  354. JobName: req.JobName,
  355. Description: req.Description,
  356. Config: models.UserImageConfig{
  357. WorkServerNum: req.WorkServerNumber,
  358. AppUrl: req.CodeObsPath,
  359. BootFileUrl: req.BootFileUrl,
  360. DataUrl: req.DataUrl,
  361. TrainUrl: req.TrainUrl,
  362. LogUrl: req.LogUrl,
  363. PoolID: req.PoolID,
  364. CreateVersion: true,
  365. Flavor: models.Flavor{
  366. Code: req.FlavorCode,
  367. },
  368. Parameter: req.Parameters,
  369. UserImageUrl: req.UserImageUrl,
  370. UserCommand: req.UserCommand,
  371. },
  372. })
  373. }
  374. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  375. createTime := timeutil.TimeStampNow()
  376. var jobResult *models.CreateTrainJobResult
  377. var createErr error
  378. if req.EngineID < 0 {
  379. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  380. Description: req.Description,
  381. Config: models.TrainJobVersionUserImageConfig{
  382. WorkServerNum: req.WorkServerNumber,
  383. AppUrl: req.CodeObsPath,
  384. BootFileUrl: req.BootFileUrl,
  385. DataUrl: req.DataUrl,
  386. TrainUrl: req.TrainUrl,
  387. LogUrl: req.LogUrl,
  388. PoolID: req.PoolID,
  389. Flavor: models.Flavor{
  390. Code: req.Spec.SourceSpecId,
  391. },
  392. Parameter: req.Parameters,
  393. PreVersionId: req.PreVersionId,
  394. UserImageUrl: req.UserImageUrl,
  395. UserCommand: req.UserCommand,
  396. },
  397. }, jobId)
  398. } else {
  399. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  400. Description: req.Description,
  401. Config: models.TrainJobVersionConfig{
  402. WorkServerNum: req.WorkServerNumber,
  403. AppUrl: req.CodeObsPath,
  404. BootFileUrl: req.BootFileUrl,
  405. DataUrl: req.DataUrl,
  406. EngineID: req.EngineID,
  407. TrainUrl: req.TrainUrl,
  408. LogUrl: req.LogUrl,
  409. PoolID: req.PoolID,
  410. Flavor: models.Flavor{
  411. Code: req.Spec.SourceSpecId,
  412. },
  413. Parameter: req.Parameters,
  414. PreVersionId: req.PreVersionId,
  415. },
  416. }, jobId)
  417. }
  418. if createErr != nil {
  419. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  420. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  421. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  422. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  423. JobID: jobId,
  424. VersionID: models.TempVersionId,
  425. Status: models.TempJobStatus,
  426. Type: models.TypeCloudBrainTwo,
  427. JobName: req.JobName,
  428. JobType: string(models.JobTypeTrain),
  429. })
  430. if errTemp != nil {
  431. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  432. return errTemp
  433. }
  434. }
  435. return createErr
  436. }
  437. var jobTypes []string
  438. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  439. repo := ctx.Repo.Repository
  440. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  441. RepoID: repo.ID,
  442. Type: models.TypeCloudBrainTwo,
  443. JobTypes: jobTypes,
  444. JobID: strconv.FormatInt(jobResult.JobID, 10),
  445. })
  446. if createErr != nil {
  447. ctx.ServerError("Cloudbrain", createErr)
  448. return createErr
  449. }
  450. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  451. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  452. Status: TransTrainJobStatus(jobResult.Status),
  453. UserID: ctx.User.ID,
  454. RepoID: ctx.Repo.Repository.ID,
  455. JobID: strconv.FormatInt(jobResult.JobID, 10),
  456. JobName: req.JobName,
  457. DisplayJobName: req.DisplayJobName,
  458. JobType: string(models.JobTypeTrain),
  459. Type: models.TypeCloudBrainTwo,
  460. VersionID: jobResult.VersionID,
  461. VersionName: jobResult.VersionName,
  462. Uuid: req.Uuid,
  463. DatasetName: req.DatasetName,
  464. CommitID: req.CommitID,
  465. IsLatestVersion: req.IsLatestVersion,
  466. PreVersionName: req.PreVersionName,
  467. ComputeResource: models.NPUResource,
  468. EngineID: req.EngineID,
  469. TrainUrl: req.TrainUrl,
  470. BranchName: req.BranchName,
  471. Parameters: req.Params,
  472. BootFile: req.BootFile,
  473. DataUrl: req.DataUrl,
  474. LogUrl: req.LogUrl,
  475. PreVersionId: req.PreVersionId,
  476. FlavorCode: req.Spec.SourceSpecId,
  477. Description: req.Description,
  478. WorkServerNumber: req.WorkServerNumber,
  479. FlavorName: req.FlavorName,
  480. EngineName: req.EngineName,
  481. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  482. VersionCount: VersionListCount + 1,
  483. CreatedUnix: createTime,
  484. UpdatedUnix: createTime,
  485. Spec: req.Spec,
  486. ModelName: req.ModelName,
  487. ModelVersion: req.ModelVersion,
  488. LabelName: req.LabelName,
  489. PreTrainModelUrl: req.PreTrainModelUrl,
  490. CkptName: req.CkptName,
  491. })
  492. if createErr != nil {
  493. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  494. return createErr
  495. }
  496. //将训练任务的上一版本的isLatestVersion设置为"0"
  497. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  498. if createErr != nil {
  499. ctx.ServerError("Update IsLatestVersion failed", createErr)
  500. return createErr
  501. }
  502. return createErr
  503. }
  504. func TransTrainJobStatus(status int) string {
  505. switch status {
  506. case 0:
  507. return "UNKNOWN"
  508. case 1:
  509. return "INIT"
  510. case 2:
  511. return "IMAGE_CREATING"
  512. case 3:
  513. return "IMAGE_FAILED"
  514. case 4:
  515. return "SUBMIT_TRYING"
  516. case 5:
  517. return "SUBMIT_FAILED"
  518. case 6:
  519. return "DELETE_FAILED"
  520. case 7:
  521. return "WAITING"
  522. case 8:
  523. return "RUNNING"
  524. case 9:
  525. return "KILLING"
  526. case 10:
  527. return "COMPLETED"
  528. case 11:
  529. return "FAILED"
  530. case 12:
  531. return "KILLED"
  532. case 13:
  533. return "CANCELED"
  534. case 14:
  535. return "LOST"
  536. case 15:
  537. return "SCALING"
  538. case 16:
  539. return "SUBMIT_MODEL_FAILED"
  540. case 17:
  541. return "DEPLOY_SERVICE_FAILED"
  542. case 18:
  543. return "CHECK_INIT"
  544. case 19:
  545. return "CHECK_RUNNING"
  546. case 20:
  547. return "CHECK_RUNNING_COMPLETED"
  548. case 21:
  549. return "CHECK_FAILED"
  550. default:
  551. return strconv.Itoa(status)
  552. }
  553. }
  554. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  555. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  556. VersionOutputPath = "V" + talVersionCountToString
  557. return VersionOutputPath
  558. }
  559. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
  560. createTime := timeutil.TimeStampNow()
  561. var jobResult *models.CreateTrainJobResult
  562. var createErr error
  563. if req.EngineID < 0 {
  564. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  565. JobName: req.JobName,
  566. Description: req.Description,
  567. Config: models.InfUserImageConfig{
  568. WorkServerNum: req.WorkServerNumber,
  569. AppUrl: req.CodeObsPath,
  570. BootFileUrl: req.BootFileUrl,
  571. DataUrl: req.DataUrl,
  572. // TrainUrl: req.TrainUrl,
  573. LogUrl: req.LogUrl,
  574. PoolID: req.PoolID,
  575. CreateVersion: true,
  576. Flavor: models.Flavor{
  577. Code: req.Spec.SourceSpecId,
  578. },
  579. Parameter: req.Parameters,
  580. UserImageUrl: req.UserImageUrl,
  581. UserCommand: req.UserCommand,
  582. },
  583. })
  584. } else {
  585. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  586. JobName: req.JobName,
  587. Description: req.Description,
  588. InfConfig: models.InfConfig{
  589. WorkServerNum: req.WorkServerNumber,
  590. AppUrl: req.CodeObsPath,
  591. BootFileUrl: req.BootFileUrl,
  592. DataUrl: req.DataUrl,
  593. EngineID: req.EngineID,
  594. // TrainUrl: req.TrainUrl,
  595. LogUrl: req.LogUrl,
  596. PoolID: req.PoolID,
  597. CreateVersion: true,
  598. Flavor: models.Flavor{
  599. Code: req.Spec.SourceSpecId,
  600. },
  601. Parameter: req.Parameters,
  602. },
  603. })
  604. }
  605. if createErr != nil {
  606. log.Error("createInferenceJob failed: %v", err.Error())
  607. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  608. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  609. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  610. JobID: models.TempJobId,
  611. VersionID: models.TempVersionId,
  612. Status: models.TempJobStatus,
  613. Type: models.TypeCloudBrainTwo,
  614. JobName: req.JobName,
  615. JobType: req.JobType,
  616. })
  617. if err != nil {
  618. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  619. return "", err
  620. }
  621. }
  622. return "", err
  623. }
  624. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  625. // if err != nil {
  626. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  627. // return err
  628. // }
  629. jobID := strconv.FormatInt(jobResult.JobID, 10)
  630. err = models.CreateCloudbrain(&models.Cloudbrain{
  631. Status: TransTrainJobStatus(jobResult.Status),
  632. UserID: ctx.User.ID,
  633. RepoID: ctx.Repo.Repository.ID,
  634. JobID: jobID,
  635. JobName: req.JobName,
  636. DisplayJobName: req.DisplayJobName,
  637. JobType: req.JobType,
  638. Type: models.TypeCloudBrainTwo,
  639. VersionID: jobResult.VersionID,
  640. VersionName: jobResult.VersionName,
  641. Uuid: req.Uuid,
  642. DatasetName: req.DatasetName,
  643. CommitID: req.CommitID,
  644. EngineID: req.EngineID,
  645. TrainUrl: req.TrainUrl,
  646. BranchName: req.BranchName,
  647. Parameters: req.Params,
  648. BootFile: req.BootFile,
  649. DataUrl: req.DataUrl,
  650. LogUrl: req.LogUrl,
  651. FlavorCode: req.Spec.SourceSpecId,
  652. Description: req.Description,
  653. WorkServerNumber: req.WorkServerNumber,
  654. FlavorName: req.FlavorName,
  655. EngineName: req.EngineName,
  656. LabelName: req.LabelName,
  657. IsLatestVersion: req.IsLatestVersion,
  658. ComputeResource: models.NPUResource,
  659. VersionCount: req.VersionCount,
  660. TotalVersionCount: req.TotalVersionCount,
  661. ModelName: req.ModelName,
  662. ModelVersion: req.ModelVersion,
  663. CkptName: req.CkptName,
  664. ResultUrl: req.ResultUrl,
  665. CreatedUnix: createTime,
  666. UpdatedUnix: createTime,
  667. Spec: req.Spec,
  668. })
  669. if err != nil {
  670. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  671. return "", err
  672. }
  673. if req.JobType == string(models.JobTypeModelSafety) {
  674. task, err := models.GetCloudbrainByJobID(jobID)
  675. if err == nil {
  676. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  677. }
  678. } else {
  679. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  680. }
  681. return jobID, nil
  682. }
  683. func GetNotebookImageName(imageId string) (string, error) {
  684. var validImage = false
  685. var imageName = ""
  686. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  687. if imageInfo.Id == imageId {
  688. validImage = true
  689. imageName = imageInfo.Value
  690. }
  691. }
  692. if !validImage {
  693. log.Error("the image id(%s) is invalid", imageId)
  694. return imageName, errors.New("the image id is invalid")
  695. }
  696. return imageName, nil
  697. }
  698. func InitSpecialPool() {
  699. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  700. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  701. }
  702. }
  703. func InitMultiNode() {
  704. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  705. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  706. }
  707. }
  708. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  709. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  710. if err != nil {
  711. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  712. return err
  713. }
  714. if result != nil {
  715. oldStatus := task.Status
  716. task.Status = TransTrainJobStatus(result.IntStatus)
  717. task.Duration = result.Duration / 1000
  718. task.TrainJobDuration = result.TrainJobDuration
  719. if task.StartTime == 0 && result.StartTime > 0 {
  720. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  721. }
  722. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  723. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  724. task.EndTime = task.StartTime.Add(task.Duration)
  725. }
  726. task.CorrectCreateUnix()
  727. if oldStatus != task.Status {
  728. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  729. }
  730. err = models.UpdateJob(task)
  731. if err != nil {
  732. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  733. return err
  734. }
  735. }
  736. return nil
  737. }
  738. func HandleNotebookInfo(task *models.Cloudbrain) error {
  739. var result *models.GetNotebook2Result
  740. var err error
  741. if task.Type == models.TypeCloudBrainTwo {
  742. result, err = GetNotebook2(task.JobID)
  743. } else if task.Type == models.TypeCDCenter {
  744. result, err = modelarts_cd.GetNotebook(task.JobID)
  745. }
  746. if err != nil {
  747. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  748. return err
  749. }
  750. if result != nil {
  751. oldStatus := task.Status
  752. task.Status = result.Status
  753. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  754. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  755. }
  756. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  757. task.EndTime = timeutil.TimeStampNow()
  758. }
  759. task.CorrectCreateUnix()
  760. task.ComputeAndSetDuration()
  761. if oldStatus != task.Status {
  762. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  763. }
  764. if task.FlavorCode == "" {
  765. task.FlavorCode = result.Flavor
  766. }
  767. err = models.UpdateJob(task)
  768. if err != nil {
  769. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  770. return err
  771. }
  772. }
  773. return nil
  774. }
  775. func SyncTempStatusJob() {
  776. jobs, err := models.GetCloudBrainTempJobs()
  777. if err != nil {
  778. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  779. return
  780. }
  781. for _, temp := range jobs {
  782. log.Info("start to handle record: %s", temp.JobName)
  783. if temp.Type == models.TypeCloudBrainTwo {
  784. if temp.JobType == string(models.JobTypeDebug) {
  785. err = handleNotebook(temp)
  786. if err != nil {
  787. log.Error("handleNotebook falied:%v", err)
  788. break
  789. }
  790. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  791. _, err = models.GetCloudbrainByJobID(temp.JobID)
  792. if err != nil {
  793. //one version
  794. err = handleTrainJob(temp)
  795. if err != nil {
  796. log.Error("handleTrainJob falied:%v", err)
  797. break
  798. }
  799. } else {
  800. //multi version
  801. err = handleTrainJobMultiVersion(temp)
  802. if err != nil {
  803. log.Error("handleTrainJobMultiVersion falied:%v", err)
  804. break
  805. }
  806. }
  807. }
  808. }
  809. }
  810. return
  811. }
  812. func handleNotebook(temp *models.CloudbrainTemp) error {
  813. if temp.Status == models.TempJobStatus {
  814. err := handleTempNotebook(temp)
  815. if err != nil {
  816. log.Error("handleTempNotebook failed:%v", err)
  817. return err
  818. }
  819. } else if temp.Status == string(models.ModelArtsStopping) {
  820. res, err := GetNotebook2(temp.JobID)
  821. if err != nil {
  822. log.Error("GetNotebook2 failed:%v", err)
  823. return err
  824. }
  825. temp.Status = res.Status
  826. if temp.Status == string(models.ModelArtsStopped) {
  827. err = models.UpdateCloudbrainTemp(temp)
  828. if err != nil {
  829. log.Error("UpdateCloudbrainTemp failed:%v", err)
  830. return err
  831. }
  832. _, err := DelNotebook2(temp.JobID)
  833. if err != nil {
  834. log.Error("DelNotebook2 failed:%v", err)
  835. return err
  836. }
  837. temp.Status = string(models.ModelArtsDeleted)
  838. err = models.UpdateCloudbrainTemp(temp)
  839. if err != nil {
  840. log.Error("UpdateCloudbrainTemp failed:%v", err)
  841. return err
  842. }
  843. }
  844. }
  845. return nil
  846. }
  847. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  848. var err error
  849. var isExist bool
  850. for {
  851. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  852. if err != nil {
  853. log.Error("GetNotebookList failed:%v", err)
  854. break
  855. }
  856. temp.QueryTimes++
  857. err = models.UpdateCloudbrainTemp(temp)
  858. if err != nil {
  859. log.Error("UpdateCloudbrainTemp failed:%v", err)
  860. }
  861. if result != nil {
  862. for _, notebook := range result.NotebookList {
  863. if temp.JobID == models.TempJobId {
  864. //new notebook
  865. if notebook.JobName == temp.JobName {
  866. isExist = true
  867. temp.Status = notebook.Status
  868. temp.JobID = notebook.JobID
  869. break
  870. }
  871. } else {
  872. //restart: always can find one record
  873. if notebook.JobName == temp.JobName {
  874. if notebook.Status != string(models.ModelArtsStopped) {
  875. isExist = true
  876. temp.Status = notebook.Status
  877. temp.JobID = notebook.JobID
  878. break
  879. }
  880. }
  881. }
  882. }
  883. if isExist {
  884. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  885. if temp.Status == string(models.ModelArtsCreateFailed) {
  886. err = models.UpdateCloudbrainTemp(temp)
  887. if err != nil {
  888. log.Error("UpdateCloudbrainTemp failed:%v", err)
  889. break
  890. }
  891. _, err := DelNotebook2(temp.JobID)
  892. if err != nil {
  893. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  894. break
  895. }
  896. temp.Status = string(models.ModelArtsDeleted)
  897. } else {
  898. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  899. if err != nil {
  900. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  901. break
  902. }
  903. temp.Status = string(models.ModelArtsStopping)
  904. }
  905. models.UpdateCloudbrainTemp(temp)
  906. } else {
  907. log.Error("can not find the record(%s) till now", temp.JobName)
  908. err = errors.New("not found")
  909. break
  910. }
  911. } else {
  912. log.Error("can not find the record(%s) till now", temp.JobName)
  913. err = errors.New("not found")
  914. break
  915. }
  916. break
  917. }
  918. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  919. log.Info("reach MaxTempQueryTimes, set the job failed")
  920. temp.Status = string(models.ModelArtsTrainJobFailed)
  921. err = models.UpdateCloudbrainTemp(temp)
  922. if err != nil {
  923. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  924. return err
  925. }
  926. }
  927. return err
  928. }
  929. func handleTrainJob(temp *models.CloudbrainTemp) error {
  930. if temp.Status == models.TempJobStatus {
  931. err := handleTempTrainJob(temp)
  932. if err != nil {
  933. log.Error("handleTempTrainJob failed:%v", err)
  934. return err
  935. }
  936. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  937. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  938. if err != nil {
  939. log.Error("GetTrainJob failed:%v", err)
  940. return err
  941. }
  942. temp.Status = TransTrainJobStatus(res.IntStatus)
  943. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  944. err = models.UpdateCloudbrainTemp(temp)
  945. if err != nil {
  946. log.Error("UpdateCloudbrainTemp failed:%v", err)
  947. return err
  948. }
  949. _, err := DelTrainJob(temp.JobID)
  950. if err != nil {
  951. log.Error("DelTrainJob failed:%v", err)
  952. return err
  953. }
  954. temp.Status = string(models.ModelArtsDeleted)
  955. err = models.UpdateCloudbrainTemp(temp)
  956. if err != nil {
  957. log.Error("UpdateCloudbrainTemp failed:%v", err)
  958. return err
  959. }
  960. }
  961. }
  962. return nil
  963. }
  964. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  965. if temp.Status == models.TempJobStatus {
  966. err := handleTempTrainJobMultiVersion(temp)
  967. if err != nil {
  968. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  969. return err
  970. }
  971. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  972. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  973. if err != nil {
  974. log.Error("GetTrainJob failed:%v", err)
  975. return err
  976. }
  977. temp.Status = TransTrainJobStatus(res.IntStatus)
  978. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  979. err = models.UpdateCloudbrainTemp(temp)
  980. if err != nil {
  981. log.Error("UpdateCloudbrainTemp failed:%v", err)
  982. return err
  983. }
  984. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  985. if err != nil {
  986. log.Error("DelTrainJob failed:%v", err)
  987. return err
  988. }
  989. temp.Status = string(models.ModelArtsDeleted)
  990. err = models.UpdateCloudbrainTemp(temp)
  991. if err != nil {
  992. log.Error("UpdateCloudbrainTemp failed:%v", err)
  993. return err
  994. }
  995. }
  996. }
  997. return nil
  998. }
  999. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1000. var err error
  1001. var isExist bool
  1002. for {
  1003. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1004. if err != nil {
  1005. log.Error("GetTrainJobVersionList failed:%v", err)
  1006. break
  1007. }
  1008. temp.QueryTimes++
  1009. err = models.UpdateCloudbrainTemp(temp)
  1010. if err != nil {
  1011. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1012. }
  1013. if result != nil {
  1014. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1015. if result.VersionCount == int64(count+1) {
  1016. isExist = true
  1017. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1018. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1019. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1020. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1021. if err != nil {
  1022. log.Error("StopTrainJob failed:%v", err)
  1023. break
  1024. }
  1025. temp.Status = string(models.ModelArtsTrainJobKilling)
  1026. err = models.UpdateCloudbrainTemp(temp)
  1027. if err != nil {
  1028. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1029. break
  1030. }
  1031. } else {
  1032. log.Error("can not find the record(%s) till now", temp.JobName)
  1033. err = errors.New("not found")
  1034. break
  1035. }
  1036. }
  1037. break
  1038. }
  1039. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1040. log.Info("reach MaxTempQueryTimes, set the job failed")
  1041. temp.Status = string(models.ModelArtsTrainJobFailed)
  1042. err = models.UpdateCloudbrainTemp(temp)
  1043. if err != nil {
  1044. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1045. return err
  1046. }
  1047. }
  1048. return err
  1049. }
  1050. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1051. var err error
  1052. var isExist bool
  1053. for {
  1054. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1055. if err != nil {
  1056. log.Error("GetTrainJobList failed:%v", err)
  1057. break
  1058. }
  1059. temp.QueryTimes++
  1060. err = models.UpdateCloudbrainTemp(temp)
  1061. if err != nil {
  1062. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1063. }
  1064. if result != nil {
  1065. for _, job := range result.JobList {
  1066. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1067. isExist = true
  1068. temp.Status = TransTrainJobStatus(job.IntStatus)
  1069. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1070. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1071. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1072. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1073. if err != nil {
  1074. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1075. break
  1076. }
  1077. temp.Status = string(models.ModelArtsTrainJobKilling)
  1078. err = models.UpdateCloudbrainTemp(temp)
  1079. if err != nil {
  1080. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1081. break
  1082. }
  1083. }
  1084. }
  1085. if !isExist {
  1086. log.Error("can not find the record(%s) till now", temp.JobName)
  1087. err = errors.New("not found")
  1088. break
  1089. }
  1090. }
  1091. break
  1092. }
  1093. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1094. log.Info("reach MaxTempQueryTimes, set the job failed")
  1095. temp.Status = string(models.ModelArtsTrainJobFailed)
  1096. err = models.UpdateCloudbrainTemp(temp)
  1097. if err != nil {
  1098. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1099. return err
  1100. }
  1101. }
  1102. return err
  1103. }