You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 38 kB

4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324
  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "path"
  7. "strconv"
  8. "strings"
  9. "code.gitea.io/gitea/modules/modelarts_cd"
  10. "code.gitea.io/gitea/models"
  11. "code.gitea.io/gitea/modules/context"
  12. "code.gitea.io/gitea/modules/log"
  13. "code.gitea.io/gitea/modules/notification"
  14. "code.gitea.io/gitea/modules/setting"
  15. "code.gitea.io/gitea/modules/storage"
  16. "code.gitea.io/gitea/modules/timeutil"
  17. )
  // Constants used by the ModelArts notebook, train-job and inference-job helpers in this file.
  18. const (
  19. //notebook
  20. storageTypeOBS = "obs" // storage type sent with the notebook created by GenerateTask
  21. autoStopDuration = 4 * 60 * 60 // auto-stop delay used by GenerateTask; 4h, presumably in seconds
  22. autoStopDurationMs = 4 * 60 * 60 * 1000 // auto-stop delay used by GenerateNotebook2; 4h, presumably in milliseconds (per the Ms suffix)
  23. MORDELART_USER_IMAGE_ENGINE_ID = -1 // negative engine ID; the Generate* functions take the user-image path when EngineID < 0
  24. DataSetMountPath = "/home/ma-user/work"
  25. NotebookEnv = "Python3"
  26. NotebookType = "Ascend"
  27. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  28. //train-job
  // NOTE: the commented-out declarations below are disabled JSON configuration strings, kept verbatim.
  29. // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  30. // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  31. // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  32. // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  33. // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  34. // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  35. // "]}"
  36. // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  37. // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  38. // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  39. // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  40. // "]}"
  // Path fragments; presumably sub-directories of a job's storage location (not used in the visible code).
  41. CodePath = "/code/"
  42. OutputPath = "/output/"
  43. ResultPath = "/result/"
  44. LogPath = "/log/"
  45. JobPath = "/job/"
  46. OrderDesc = "desc" // query downward
  47. OrderAsc = "asc" // query upward
  48. Lines = 500
  // Parameter label names; presumably the labels of run parameters passed to jobs (TODO confirm against callers).
  49. TrainUrl = "train_url"
  50. DataUrl = "data_url"
  51. MultiDataUrl = "multi_data_url"
  52. ResultUrl = "result_url"
  53. CkptUrl = "ckpt_url"
  54. DeviceTarget = "device_target"
  55. Ascend = "Ascend"
  56. PerPage = 10
  57. IsLatestVersion = "1"
  58. NotLatestVersion = "0" // written to the previous version by GenerateTrainJobVersion
  59. VersionCountOne = 1 // passed to models.SetVersionCountAndLatestVersion by GenerateTrainJobVersion
  60. SortByCreateTime = "create_time"
  61. ConfigTypeCustom = "custom"
  62. TotalVersionCount = 1 // passed to models.SetVersionCountAndLatestVersion by GenerateTrainJobVersion
  // NFS volume attached to every train job submitted by GenerateTrainJob.
  63. VolumeID = "c6a73891-6a19-4a6a-a2e8-0d6baa72a7c5"
  64. VolumeSourcePath = "192.168.0.30:/"
  65. VolumeDestPath = "/cache/sfs"
  66. )
  // Package-level state shared by the ModelArts helpers.
  67. var (
  68. poolInfos *models.PoolInfos // filled on first use by unmarshalling setting.PoolInfos (see GenerateTask and GenerateNotebook2)
  69. TrainFlavorInfos *Flavor // not assigned in the visible part of this file
  70. SpecialPools *models.SpecialPools // not assigned in the visible part of this file
  71. MultiNodeConfig *MultiNodes // not assigned in the visible part of this file
  72. )
  // GenerateTrainJobReq carries the inputs consumed by GenerateTrainJob,
  // GenerateTrainJobVersion and GenerateModelConvertTrainJob.
  73. type GenerateTrainJobReq struct {
  74. JobName string
  75. DisplayJobName string // name used in log messages and notifications
  76. Uuid string
  77. Description string
  78. CodeObsPath string // sent to ModelArts as Config.AppUrl
  79. BootFile string // stored on the Cloudbrain record only
  80. BootFileUrl string // sent to ModelArts as Config.BootFileUrl
  81. DataUrl string
  82. TrainUrl string
  83. LogUrl string
  84. PoolID string
  85. WorkServerNumber int // sent to ModelArts as Config.WorkServerNum
  86. EngineID int64 // a negative value selects the user-image request variant
  87. Parameters []models.Parameter // sent to ModelArts as Config.Parameter
  88. CommitID string
  89. IsLatestVersion string
  90. Params string // stored on the Cloudbrain record as Parameters
  91. BranchName string
  92. PreVersionId int64 // used by GenerateTrainJobVersion only
  93. PreVersionName string // used by GenerateTrainJobVersion only
  94. FlavorCode string // flavor code used by GenerateModelConvertTrainJob; the other functions use Spec.SourceSpecId
  95. FlavorName string
  96. VersionCount int
  97. EngineName string
  98. TotalVersionCount int
  99. UserImageUrl string // only sent on the user-image path
  100. UserCommand string // only sent on the user-image path
  101. DatasetName string
  102. Spec *models.Specification // Spec.SourceSpecId is sent as the flavor code; dereferenced without a nil check
  103. ModelName string
  104. LabelName string
  105. CkptName string
  106. ModelVersion string
  107. PreTrainModelUrl string
  108. }
  // GenerateInferenceJobReq carries the inputs consumed by GenerateInferenceJob.
  109. type GenerateInferenceJobReq struct {
  110. JobName string
  111. DisplayJobName string // name used in log messages and notifications
  112. Uuid string
  113. Description string
  114. CodeObsPath string // sent to ModelArts as AppUrl
  115. BootFile string // stored on the Cloudbrain record only
  116. BootFileUrl string
  117. DataUrl string
  118. TrainUrl string // stored on the Cloudbrain record; the ModelArts request leaves TrainUrl commented out
  119. LogUrl string
  120. PoolID string
  121. WorkServerNumber int // sent to ModelArts as WorkServerNum
  122. EngineID int64 // a negative value selects the user-image request variant
  123. Parameters []models.Parameter // sent to ModelArts as Parameter
  124. CommitID string
  125. Params string // stored on the Cloudbrain record as Parameters
  126. BranchName string
  127. FlavorName string
  128. EngineName string
  129. LabelName string
  130. IsLatestVersion string
  131. VersionCount int
  132. TotalVersionCount int
  133. ModelName string
  134. ModelVersion string
  135. CkptName string
  136. ResultUrl string
  137. Spec *models.Specification // Spec.SourceSpecId is sent as the flavor code; dereferenced without a nil check
  138. DatasetName string
  139. JobType string // stored as the Cloudbrain JobType and compared with models.JobTypeModelSafety when notifying
  140. UserImageUrl string // only sent on the user-image path
  141. UserCommand string // only sent on the user-image path
  142. }
  // VersionInfo maps a JSON object of the form {"version":[{"id":…,"value":…,"url":…}]}.
  143. type VersionInfo struct {
  144. Version []struct {
  145. ID int `json:"id"`
  146. Value string `json:"value"`
  147. Url string `json:"url"`
  148. } `json:"version"`
  149. }
  // Flavor maps a JSON object of the form {"flavor":[{"code":…,"value":…,"unitPrice":…}]}.
  150. type Flavor struct {
  151. Info []struct {
  152. Code string `json:"code"`
  153. Value string `json:"value"`
  154. UnitPrice int64 `json:"unitPrice"`
  155. } `json:"flavor"`
  156. }
  // Engine maps a JSON object of the form {"engine":[{"id":…,"value":…}]}.
  157. type Engine struct {
  158. Info []struct {
  159. ID int `json:"id"`
  160. Value string `json:"value"`
  161. } `json:"engine"`
  162. }
  // ResourcePool maps a JSON object of the form {"resource_pool":[{"id":…,"value":…}]}.
  163. type ResourcePool struct {
  164. Info []struct {
  165. ID string `json:"id"`
  166. Value string `json:"value"`
  167. } `json:"resource_pool"`
  168. }
  // MultiNodes maps a JSON object of the form {"multinode":[…]} whose entries are OrgMultiNode values.
  169. type MultiNodes struct {
  170. Info []OrgMultiNode `json:"multinode"`
  171. }
  // OrgMultiNode is one "multinode" entry: an "org" string paired with a list of integers under "node".
  // Presumably the node counts allowed for that organization (TODO confirm against the consumer).
  172. type OrgMultiNode struct {
  173. Org string `json:"org"`
  174. Node []int `json:"node"`
  175. }
  176. // type Parameter struct {
  177. // Label string `json:"label"`
  178. // Value string `json:"value"`
  179. // }
  180. // type Parameters struct {
  181. // Parameter []Parameter `json:"parameter"`
  182. // }
  // Parameters maps a JSON object of the form {"parameter":[{"label":…,"value":…}]}.
  183. type Parameters struct {
  184. Parameter []struct {
  185. Label string `json:"label"`
  186. Value string `json:"value"`
  187. } `json:"parameter"`
  188. }
  189. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  190. var dataActualPath string
  191. if uuid != "" {
  192. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  193. } else {
  194. userPath := setting.UserBasePath + ctx.User.Name + "/"
  195. isExist, err := storage.ObsHasObject(userPath)
  196. if err != nil {
  197. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  198. return err
  199. }
  200. if !isExist {
  201. if err = storage.ObsCreateObject(userPath); err != nil {
  202. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  203. return err
  204. }
  205. }
  206. dataActualPath = setting.Bucket + "/" + userPath
  207. }
  208. if poolInfos == nil {
  209. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  210. }
  211. createTime := timeutil.TimeStampNow()
  212. jobResult, err := CreateJob(models.CreateNotebookParams{
  213. JobName: jobName,
  214. Description: description,
  215. ProfileID: setting.ProfileID,
  216. Flavor: flavor,
  217. Pool: models.Pool{
  218. ID: poolInfos.PoolInfo[0].PoolId,
  219. Name: poolInfos.PoolInfo[0].PoolName,
  220. Type: poolInfos.PoolInfo[0].PoolType,
  221. },
  222. Spec: models.Spec{
  223. Storage: models.Storage{
  224. Type: storageTypeOBS,
  225. Location: models.Location{
  226. Path: dataActualPath,
  227. },
  228. },
  229. AutoStop: models.AutoStop{
  230. Enable: true,
  231. Duration: autoStopDuration,
  232. },
  233. },
  234. })
  235. if err != nil {
  236. log.Error("CreateJob failed: %v", err.Error())
  237. return err
  238. }
  239. err = models.CreateCloudbrain(&models.Cloudbrain{
  240. Status: string(models.JobWaiting),
  241. UserID: ctx.User.ID,
  242. RepoID: ctx.Repo.Repository.ID,
  243. JobID: jobResult.ID,
  244. JobName: jobName,
  245. JobType: string(models.JobTypeDebug),
  246. Type: models.TypeCloudBrainTwo,
  247. Uuid: uuid,
  248. ComputeResource: models.NPUResource,
  249. CreatedUnix: createTime,
  250. UpdatedUnix: createTime,
  251. })
  252. if err != nil {
  253. return err
  254. }
  255. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  256. return nil
  257. }
  258. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
  259. if poolInfos == nil {
  260. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  261. }
  262. imageName, err := GetNotebookImageName(imageId)
  263. if err != nil {
  264. log.Error("GetNotebookImageName failed: %v", err.Error())
  265. return err
  266. }
  267. createTime := timeutil.TimeStampNow()
  268. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  269. JobName: jobName,
  270. Description: description,
  271. Flavor: spec.SourceSpecId,
  272. Duration: autoStopDurationMs,
  273. ImageID: imageId,
  274. PoolID: poolInfos.PoolInfo[0].PoolId,
  275. Feature: models.NotebookFeature,
  276. Volume: models.VolumeReq{
  277. Capacity: setting.Capacity,
  278. Category: models.EVSCategory,
  279. Ownership: models.ManagedOwnership,
  280. },
  281. WorkspaceID: "0",
  282. })
  283. if err != nil {
  284. log.Error("createNotebook2 failed: %v", err.Error())
  285. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  286. log.Info("(%s)unknown error, set temp status", displayJobName)
  287. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  288. JobID: models.TempJobId,
  289. VersionID: models.TempVersionId,
  290. Status: models.TempJobStatus,
  291. Type: models.TypeCloudBrainTwo,
  292. JobName: jobName,
  293. JobType: string(models.JobTypeDebug),
  294. })
  295. if errTemp != nil {
  296. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  297. return errTemp
  298. }
  299. }
  300. return err
  301. }
  302. task := &models.Cloudbrain{
  303. Status: jobResult.Status,
  304. UserID: ctx.User.ID,
  305. RepoID: ctx.Repo.Repository.ID,
  306. JobID: jobResult.ID,
  307. JobName: jobName,
  308. FlavorCode: spec.SourceSpecId,
  309. DisplayJobName: displayJobName,
  310. JobType: string(models.JobTypeDebug),
  311. Type: models.TypeCloudBrainTwo,
  312. Uuid: uuid,
  313. ComputeResource: models.NPUResource,
  314. Image: imageName,
  315. Description: description,
  316. CreatedUnix: createTime,
  317. UpdatedUnix: createTime,
  318. Spec: spec,
  319. }
  320. err = models.CreateCloudbrain(task)
  321. if err != nil {
  322. return err
  323. }
  324. stringId := strconv.FormatInt(task.ID, 10)
  325. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  326. return nil
  327. }
  328. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  329. createTime := timeutil.TimeStampNow()
  330. var jobResult *models.CreateTrainJobResult
  331. var createErr error
  332. if req.EngineID < 0 {
  333. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  334. JobName: req.JobName,
  335. Description: req.Description,
  336. Config: models.UserImageConfig{
  337. WorkServerNum: req.WorkServerNumber,
  338. AppUrl: req.CodeObsPath,
  339. BootFileUrl: req.BootFileUrl,
  340. DataUrl: req.DataUrl,
  341. TrainUrl: req.TrainUrl,
  342. LogUrl: req.LogUrl,
  343. PoolID: req.PoolID,
  344. CreateVersion: true,
  345. Flavor: models.Flavor{
  346. Code: req.Spec.SourceSpecId,
  347. },
  348. Parameter: req.Parameters,
  349. UserImageUrl: req.UserImageUrl,
  350. UserCommand: req.UserCommand,
  351. Volumes: []models.Volumes{
  352. {
  353. Nfs: models.Nfs{
  354. ID: VolumeID,
  355. SourcePath: VolumeSourcePath,
  356. DestPath: VolumeDestPath,
  357. ReadOnly: false,
  358. },
  359. },
  360. },
  361. },
  362. })
  363. } else {
  364. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  365. JobName: req.JobName,
  366. Description: req.Description,
  367. Config: models.Config{
  368. WorkServerNum: req.WorkServerNumber,
  369. AppUrl: req.CodeObsPath,
  370. BootFileUrl: req.BootFileUrl,
  371. DataUrl: req.DataUrl,
  372. EngineID: req.EngineID,
  373. TrainUrl: req.TrainUrl,
  374. LogUrl: req.LogUrl,
  375. PoolID: req.PoolID,
  376. CreateVersion: true,
  377. Flavor: models.Flavor{
  378. Code: req.Spec.SourceSpecId,
  379. },
  380. Parameter: req.Parameters,
  381. Volumes: []models.Volumes{
  382. {
  383. Nfs: models.Nfs{
  384. ID: VolumeID,
  385. SourcePath: VolumeSourcePath,
  386. DestPath: VolumeDestPath,
  387. ReadOnly: false,
  388. },
  389. },
  390. },
  391. },
  392. })
  393. }
  394. if createErr != nil {
  395. log.Error("createTrainJob failed: %v", createErr.Error())
  396. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  397. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  398. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  399. JobID: models.TempJobId,
  400. VersionID: models.TempVersionId,
  401. Status: models.TempJobStatus,
  402. Type: models.TypeCloudBrainTwo,
  403. JobName: req.JobName,
  404. JobType: string(models.JobTypeTrain),
  405. })
  406. if errTemp != nil {
  407. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  408. return "", errTemp
  409. }
  410. }
  411. return "", createErr
  412. }
  413. jobID := strconv.FormatInt(jobResult.JobID, 10)
  414. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  415. Status: TransTrainJobStatus(jobResult.Status),
  416. UserID: ctx.User.ID,
  417. RepoID: ctx.Repo.Repository.ID,
  418. JobID: jobID,
  419. JobName: req.JobName,
  420. DisplayJobName: req.DisplayJobName,
  421. JobType: string(models.JobTypeTrain),
  422. Type: models.TypeCloudBrainTwo,
  423. VersionID: jobResult.VersionID,
  424. VersionName: jobResult.VersionName,
  425. Uuid: req.Uuid,
  426. DatasetName: req.DatasetName,
  427. CommitID: req.CommitID,
  428. IsLatestVersion: req.IsLatestVersion,
  429. ComputeResource: models.NPUResource,
  430. EngineID: req.EngineID,
  431. TrainUrl: req.TrainUrl,
  432. BranchName: req.BranchName,
  433. Parameters: req.Params,
  434. BootFile: req.BootFile,
  435. DataUrl: req.DataUrl,
  436. LogUrl: req.LogUrl,
  437. FlavorCode: req.Spec.SourceSpecId,
  438. Description: req.Description,
  439. WorkServerNumber: req.WorkServerNumber,
  440. FlavorName: req.FlavorName,
  441. EngineName: req.EngineName,
  442. VersionCount: req.VersionCount,
  443. TotalVersionCount: req.TotalVersionCount,
  444. CreatedUnix: createTime,
  445. UpdatedUnix: createTime,
  446. Spec: req.Spec,
  447. ModelName: req.ModelName,
  448. ModelVersion: req.ModelVersion,
  449. LabelName: req.LabelName,
  450. PreTrainModelUrl: req.PreTrainModelUrl,
  451. CkptName: req.CkptName,
  452. })
  453. if createErr != nil {
  454. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  455. return "", createErr
  456. }
  457. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
  458. return jobID, nil
  459. }
  460. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  461. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  462. JobName: req.JobName,
  463. Description: req.Description,
  464. Config: models.UserImageConfig{
  465. WorkServerNum: req.WorkServerNumber,
  466. AppUrl: req.CodeObsPath,
  467. BootFileUrl: req.BootFileUrl,
  468. DataUrl: req.DataUrl,
  469. TrainUrl: req.TrainUrl,
  470. LogUrl: req.LogUrl,
  471. PoolID: req.PoolID,
  472. CreateVersion: true,
  473. Flavor: models.Flavor{
  474. Code: req.FlavorCode,
  475. },
  476. Parameter: req.Parameters,
  477. UserImageUrl: req.UserImageUrl,
  478. UserCommand: req.UserCommand,
  479. },
  480. })
  481. }
  482. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  483. createTime := timeutil.TimeStampNow()
  484. var jobResult *models.CreateTrainJobResult
  485. var createErr error
  486. if req.EngineID < 0 {
  487. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  488. Description: req.Description,
  489. Config: models.TrainJobVersionUserImageConfig{
  490. WorkServerNum: req.WorkServerNumber,
  491. AppUrl: req.CodeObsPath,
  492. BootFileUrl: req.BootFileUrl,
  493. DataUrl: req.DataUrl,
  494. TrainUrl: req.TrainUrl,
  495. LogUrl: req.LogUrl,
  496. PoolID: req.PoolID,
  497. Flavor: models.Flavor{
  498. Code: req.Spec.SourceSpecId,
  499. },
  500. Parameter: req.Parameters,
  501. PreVersionId: req.PreVersionId,
  502. UserImageUrl: req.UserImageUrl,
  503. UserCommand: req.UserCommand,
  504. },
  505. }, jobId)
  506. } else {
  507. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  508. Description: req.Description,
  509. Config: models.TrainJobVersionConfig{
  510. WorkServerNum: req.WorkServerNumber,
  511. AppUrl: req.CodeObsPath,
  512. BootFileUrl: req.BootFileUrl,
  513. DataUrl: req.DataUrl,
  514. EngineID: req.EngineID,
  515. TrainUrl: req.TrainUrl,
  516. LogUrl: req.LogUrl,
  517. PoolID: req.PoolID,
  518. Flavor: models.Flavor{
  519. Code: req.Spec.SourceSpecId,
  520. },
  521. Parameter: req.Parameters,
  522. PreVersionId: req.PreVersionId,
  523. },
  524. }, jobId)
  525. }
  526. if createErr != nil {
  527. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  528. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  529. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  530. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  531. JobID: jobId,
  532. VersionID: models.TempVersionId,
  533. Status: models.TempJobStatus,
  534. Type: models.TypeCloudBrainTwo,
  535. JobName: req.JobName,
  536. JobType: string(models.JobTypeTrain),
  537. })
  538. if errTemp != nil {
  539. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  540. return errTemp
  541. }
  542. }
  543. return createErr
  544. }
  545. var jobTypes []string
  546. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  547. repo := ctx.Repo.Repository
  548. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  549. RepoID: repo.ID,
  550. Type: models.TypeCloudBrainTwo,
  551. JobTypes: jobTypes,
  552. JobID: strconv.FormatInt(jobResult.JobID, 10),
  553. })
  554. if createErr != nil {
  555. ctx.ServerError("Cloudbrain", createErr)
  556. return createErr
  557. }
  558. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  559. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  560. Status: TransTrainJobStatus(jobResult.Status),
  561. UserID: ctx.User.ID,
  562. RepoID: ctx.Repo.Repository.ID,
  563. JobID: strconv.FormatInt(jobResult.JobID, 10),
  564. JobName: req.JobName,
  565. DisplayJobName: req.DisplayJobName,
  566. JobType: string(models.JobTypeTrain),
  567. Type: models.TypeCloudBrainTwo,
  568. VersionID: jobResult.VersionID,
  569. VersionName: jobResult.VersionName,
  570. Uuid: req.Uuid,
  571. DatasetName: req.DatasetName,
  572. CommitID: req.CommitID,
  573. IsLatestVersion: req.IsLatestVersion,
  574. PreVersionName: req.PreVersionName,
  575. ComputeResource: models.NPUResource,
  576. EngineID: req.EngineID,
  577. TrainUrl: req.TrainUrl,
  578. BranchName: req.BranchName,
  579. Parameters: req.Params,
  580. BootFile: req.BootFile,
  581. DataUrl: req.DataUrl,
  582. LogUrl: req.LogUrl,
  583. PreVersionId: req.PreVersionId,
  584. FlavorCode: req.Spec.SourceSpecId,
  585. Description: req.Description,
  586. WorkServerNumber: req.WorkServerNumber,
  587. FlavorName: req.FlavorName,
  588. EngineName: req.EngineName,
  589. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  590. VersionCount: VersionListCount + 1,
  591. CreatedUnix: createTime,
  592. UpdatedUnix: createTime,
  593. Spec: req.Spec,
  594. ModelName: req.ModelName,
  595. ModelVersion: req.ModelVersion,
  596. LabelName: req.LabelName,
  597. PreTrainModelUrl: req.PreTrainModelUrl,
  598. CkptName: req.CkptName,
  599. })
  600. if createErr != nil {
  601. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  602. return createErr
  603. }
  604. //将训练任务的上一版本的isLatestVersion设置为"0"
  605. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  606. if createErr != nil {
  607. ctx.ServerError("Update IsLatestVersion failed", createErr)
  608. return createErr
  609. }
  610. return createErr
  611. }
  612. func TransTrainJobStatus(status int) string {
  613. switch status {
  614. case 0:
  615. return "UNKNOWN"
  616. case 1:
  617. return "INIT"
  618. case 2:
  619. return "IMAGE_CREATING"
  620. case 3:
  621. return "IMAGE_FAILED"
  622. case 4:
  623. return "SUBMIT_TRYING"
  624. case 5:
  625. return "SUBMIT_FAILED"
  626. case 6:
  627. return "DELETE_FAILED"
  628. case 7:
  629. return "WAITING"
  630. case 8:
  631. return "RUNNING"
  632. case 9:
  633. return "KILLING"
  634. case 10:
  635. return "COMPLETED"
  636. case 11:
  637. return "FAILED"
  638. case 12:
  639. return "KILLED"
  640. case 13:
  641. return "CANCELED"
  642. case 14:
  643. return "LOST"
  644. case 15:
  645. return "SCALING"
  646. case 16:
  647. return "SUBMIT_MODEL_FAILED"
  648. case 17:
  649. return "DEPLOY_SERVICE_FAILED"
  650. case 18:
  651. return "CHECK_INIT"
  652. case 19:
  653. return "CHECK_RUNNING"
  654. case 20:
  655. return "CHECK_RUNNING_COMPLETED"
  656. case 21:
  657. return "CHECK_FAILED"
  658. default:
  659. return strconv.Itoa(status)
  660. }
  661. }
  662. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  663. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  664. VersionOutputPath = "V" + talVersionCountToString
  665. return VersionOutputPath
  666. }
  667. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
  668. createTime := timeutil.TimeStampNow()
  669. var jobResult *models.CreateTrainJobResult
  670. var createErr error
  671. if req.EngineID < 0 {
  672. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  673. JobName: req.JobName,
  674. Description: req.Description,
  675. Config: models.InfUserImageConfig{
  676. WorkServerNum: req.WorkServerNumber,
  677. AppUrl: req.CodeObsPath,
  678. BootFileUrl: req.BootFileUrl,
  679. DataUrl: req.DataUrl,
  680. // TrainUrl: req.TrainUrl,
  681. LogUrl: req.LogUrl,
  682. PoolID: req.PoolID,
  683. CreateVersion: true,
  684. Flavor: models.Flavor{
  685. Code: req.Spec.SourceSpecId,
  686. },
  687. Parameter: req.Parameters,
  688. UserImageUrl: req.UserImageUrl,
  689. UserCommand: req.UserCommand,
  690. },
  691. })
  692. } else {
  693. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  694. JobName: req.JobName,
  695. Description: req.Description,
  696. InfConfig: models.InfConfig{
  697. WorkServerNum: req.WorkServerNumber,
  698. AppUrl: req.CodeObsPath,
  699. BootFileUrl: req.BootFileUrl,
  700. DataUrl: req.DataUrl,
  701. EngineID: req.EngineID,
  702. // TrainUrl: req.TrainUrl,
  703. LogUrl: req.LogUrl,
  704. PoolID: req.PoolID,
  705. CreateVersion: true,
  706. Flavor: models.Flavor{
  707. Code: req.Spec.SourceSpecId,
  708. },
  709. Parameter: req.Parameters,
  710. },
  711. })
  712. }
  713. if createErr != nil {
  714. log.Error("createInferenceJob failed: %v", err.Error())
  715. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  716. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  717. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  718. JobID: models.TempJobId,
  719. VersionID: models.TempVersionId,
  720. Status: models.TempJobStatus,
  721. Type: models.TypeCloudBrainTwo,
  722. JobName: req.JobName,
  723. JobType: req.JobType,
  724. })
  725. if err != nil {
  726. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  727. return "", err
  728. }
  729. }
  730. return "", err
  731. }
  732. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  733. // if err != nil {
  734. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  735. // return err
  736. // }
  737. jobID := strconv.FormatInt(jobResult.JobID, 10)
  738. err = models.CreateCloudbrain(&models.Cloudbrain{
  739. Status: TransTrainJobStatus(jobResult.Status),
  740. UserID: ctx.User.ID,
  741. RepoID: ctx.Repo.Repository.ID,
  742. JobID: jobID,
  743. JobName: req.JobName,
  744. DisplayJobName: req.DisplayJobName,
  745. JobType: req.JobType,
  746. Type: models.TypeCloudBrainTwo,
  747. VersionID: jobResult.VersionID,
  748. VersionName: jobResult.VersionName,
  749. Uuid: req.Uuid,
  750. DatasetName: req.DatasetName,
  751. CommitID: req.CommitID,
  752. EngineID: req.EngineID,
  753. TrainUrl: req.TrainUrl,
  754. BranchName: req.BranchName,
  755. Parameters: req.Params,
  756. BootFile: req.BootFile,
  757. DataUrl: req.DataUrl,
  758. LogUrl: req.LogUrl,
  759. FlavorCode: req.Spec.SourceSpecId,
  760. Description: req.Description,
  761. WorkServerNumber: req.WorkServerNumber,
  762. FlavorName: req.FlavorName,
  763. EngineName: req.EngineName,
  764. LabelName: req.LabelName,
  765. IsLatestVersion: req.IsLatestVersion,
  766. ComputeResource: models.NPUResource,
  767. VersionCount: req.VersionCount,
  768. TotalVersionCount: req.TotalVersionCount,
  769. ModelName: req.ModelName,
  770. ModelVersion: req.ModelVersion,
  771. CkptName: req.CkptName,
  772. ResultUrl: req.ResultUrl,
  773. CreatedUnix: createTime,
  774. UpdatedUnix: createTime,
  775. Spec: req.Spec,
  776. })
  777. if err != nil {
  778. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  779. return "", err
  780. }
  781. if req.JobType == string(models.JobTypeModelSafety) {
  782. task, err := models.GetCloudbrainByJobID(jobID)
  783. if err == nil {
  784. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  785. }
  786. } else {
  787. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  788. }
  789. return jobID, nil
  790. }
  791. func GetNotebookImageName(imageId string) (string, error) {
  792. var validImage = false
  793. var imageName = ""
  794. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  795. if imageInfo.Id == imageId {
  796. validImage = true
  797. imageName = imageInfo.Value
  798. }
  799. }
  800. if !validImage {
  801. log.Error("the image id(%s) is invalid", imageId)
  802. return imageName, errors.New("the image id is invalid")
  803. }
  804. return imageName, nil
  805. }
  806. func InitSpecialPool() {
  807. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  808. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  809. }
  810. }
  811. func InitMultiNode() {
  812. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  813. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  814. }
  815. }
  816. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  817. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  818. if err != nil {
  819. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  820. return err
  821. }
  822. if result != nil {
  823. oldStatus := task.Status
  824. task.Status = TransTrainJobStatus(result.IntStatus)
  825. task.Duration = result.Duration / 1000
  826. task.TrainJobDuration = result.TrainJobDuration
  827. if task.StartTime == 0 && result.StartTime > 0 {
  828. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  829. }
  830. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  831. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  832. task.EndTime = task.StartTime.Add(task.Duration)
  833. }
  834. task.CorrectCreateUnix()
  835. if oldStatus != task.Status {
  836. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  837. }
  838. err = models.UpdateJob(task)
  839. if err != nil {
  840. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  841. return err
  842. }
  843. }
  844. return nil
  845. }
  846. func HandleNotebookInfo(task *models.Cloudbrain) error {
  847. var result *models.GetNotebook2Result
  848. var err error
  849. if task.Type == models.TypeCloudBrainTwo {
  850. result, err = GetNotebook2(task.JobID)
  851. } else if task.Type == models.TypeCDCenter {
  852. result, err = modelarts_cd.GetNotebook(task.JobID)
  853. }
  854. if err != nil {
  855. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  856. return err
  857. }
  858. if result != nil {
  859. oldStatus := task.Status
  860. task.Status = result.Status
  861. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  862. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  863. }
  864. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  865. task.EndTime = timeutil.TimeStampNow()
  866. }
  867. task.CorrectCreateUnix()
  868. task.ComputeAndSetDuration()
  869. if oldStatus != task.Status {
  870. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  871. }
  872. if task.FlavorCode == "" {
  873. task.FlavorCode = result.Flavor
  874. }
  875. err = models.UpdateJob(task)
  876. if err != nil {
  877. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  878. return err
  879. }
  880. }
  881. return nil
  882. }
  883. func SyncTempStatusJob() {
  884. jobs, err := models.GetCloudBrainTempJobs()
  885. if err != nil {
  886. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  887. return
  888. }
  889. for _, temp := range jobs {
  890. log.Info("start to handle record: %s", temp.JobName)
  891. if temp.Type == models.TypeCloudBrainTwo {
  892. if temp.JobType == string(models.JobTypeDebug) {
  893. err = handleNotebook(temp)
  894. if err != nil {
  895. log.Error("handleNotebook falied:%v", err)
  896. break
  897. }
  898. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  899. _, err = models.GetCloudbrainByJobID(temp.JobID)
  900. if err != nil {
  901. //one version
  902. err = handleTrainJob(temp)
  903. if err != nil {
  904. log.Error("handleTrainJob falied:%v", err)
  905. break
  906. }
  907. } else {
  908. //multi version
  909. err = handleTrainJobMultiVersion(temp)
  910. if err != nil {
  911. log.Error("handleTrainJobMultiVersion falied:%v", err)
  912. break
  913. }
  914. }
  915. }
  916. }
  917. }
  918. return
  919. }
  920. func handleNotebook(temp *models.CloudbrainTemp) error {
  921. if temp.Status == models.TempJobStatus {
  922. err := handleTempNotebook(temp)
  923. if err != nil {
  924. log.Error("handleTempNotebook failed:%v", err)
  925. return err
  926. }
  927. } else if temp.Status == string(models.ModelArtsStopping) {
  928. res, err := GetNotebook2(temp.JobID)
  929. if err != nil {
  930. log.Error("GetNotebook2 failed:%v", err)
  931. return err
  932. }
  933. temp.Status = res.Status
  934. if temp.Status == string(models.ModelArtsStopped) {
  935. err = models.UpdateCloudbrainTemp(temp)
  936. if err != nil {
  937. log.Error("UpdateCloudbrainTemp failed:%v", err)
  938. return err
  939. }
  940. _, err := DelNotebook2(temp.JobID)
  941. if err != nil {
  942. log.Error("DelNotebook2 failed:%v", err)
  943. return err
  944. }
  945. temp.Status = string(models.ModelArtsDeleted)
  946. err = models.UpdateCloudbrainTemp(temp)
  947. if err != nil {
  948. log.Error("UpdateCloudbrainTemp failed:%v", err)
  949. return err
  950. }
  951. }
  952. }
  953. return nil
  954. }
  955. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  956. var err error
  957. var isExist bool
  958. for {
  959. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  960. if err != nil {
  961. log.Error("GetNotebookList failed:%v", err)
  962. break
  963. }
  964. temp.QueryTimes++
  965. err = models.UpdateCloudbrainTemp(temp)
  966. if err != nil {
  967. log.Error("UpdateCloudbrainTemp failed:%v", err)
  968. }
  969. if result != nil {
  970. for _, notebook := range result.NotebookList {
  971. if temp.JobID == models.TempJobId {
  972. //new notebook
  973. if notebook.JobName == temp.JobName {
  974. isExist = true
  975. temp.Status = notebook.Status
  976. temp.JobID = notebook.JobID
  977. break
  978. }
  979. } else {
  980. //restart: always can find one record
  981. if notebook.JobName == temp.JobName {
  982. if notebook.Status != string(models.ModelArtsStopped) {
  983. isExist = true
  984. temp.Status = notebook.Status
  985. temp.JobID = notebook.JobID
  986. break
  987. }
  988. }
  989. }
  990. }
  991. if isExist {
  992. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  993. if temp.Status == string(models.ModelArtsCreateFailed) {
  994. err = models.UpdateCloudbrainTemp(temp)
  995. if err != nil {
  996. log.Error("UpdateCloudbrainTemp failed:%v", err)
  997. break
  998. }
  999. _, err := DelNotebook2(temp.JobID)
  1000. if err != nil {
  1001. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  1002. break
  1003. }
  1004. temp.Status = string(models.ModelArtsDeleted)
  1005. } else {
  1006. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  1007. if err != nil {
  1008. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  1009. break
  1010. }
  1011. temp.Status = string(models.ModelArtsStopping)
  1012. }
  1013. models.UpdateCloudbrainTemp(temp)
  1014. } else {
  1015. log.Error("can not find the record(%s) till now", temp.JobName)
  1016. err = errors.New("not found")
  1017. break
  1018. }
  1019. } else {
  1020. log.Error("can not find the record(%s) till now", temp.JobName)
  1021. err = errors.New("not found")
  1022. break
  1023. }
  1024. break
  1025. }
  1026. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1027. log.Info("reach MaxTempQueryTimes, set the job failed")
  1028. temp.Status = string(models.ModelArtsTrainJobFailed)
  1029. err = models.UpdateCloudbrainTemp(temp)
  1030. if err != nil {
  1031. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1032. return err
  1033. }
  1034. }
  1035. return err
  1036. }
  1037. func handleTrainJob(temp *models.CloudbrainTemp) error {
  1038. if temp.Status == models.TempJobStatus {
  1039. err := handleTempTrainJob(temp)
  1040. if err != nil {
  1041. log.Error("handleTempTrainJob failed:%v", err)
  1042. return err
  1043. }
  1044. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1045. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1046. if err != nil {
  1047. log.Error("GetTrainJob failed:%v", err)
  1048. return err
  1049. }
  1050. temp.Status = TransTrainJobStatus(res.IntStatus)
  1051. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1052. err = models.UpdateCloudbrainTemp(temp)
  1053. if err != nil {
  1054. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1055. return err
  1056. }
  1057. _, err := DelTrainJob(temp.JobID)
  1058. if err != nil {
  1059. log.Error("DelTrainJob failed:%v", err)
  1060. return err
  1061. }
  1062. temp.Status = string(models.ModelArtsDeleted)
  1063. err = models.UpdateCloudbrainTemp(temp)
  1064. if err != nil {
  1065. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1066. return err
  1067. }
  1068. }
  1069. }
  1070. return nil
  1071. }
  1072. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1073. if temp.Status == models.TempJobStatus {
  1074. err := handleTempTrainJobMultiVersion(temp)
  1075. if err != nil {
  1076. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1077. return err
  1078. }
  1079. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1080. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1081. if err != nil {
  1082. log.Error("GetTrainJob failed:%v", err)
  1083. return err
  1084. }
  1085. temp.Status = TransTrainJobStatus(res.IntStatus)
  1086. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1087. err = models.UpdateCloudbrainTemp(temp)
  1088. if err != nil {
  1089. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1090. return err
  1091. }
  1092. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1093. if err != nil {
  1094. log.Error("DelTrainJob failed:%v", err)
  1095. return err
  1096. }
  1097. temp.Status = string(models.ModelArtsDeleted)
  1098. err = models.UpdateCloudbrainTemp(temp)
  1099. if err != nil {
  1100. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1101. return err
  1102. }
  1103. }
  1104. }
  1105. return nil
  1106. }
  1107. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1108. var err error
  1109. var isExist bool
  1110. for {
  1111. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1112. if err != nil {
  1113. log.Error("GetTrainJobVersionList failed:%v", err)
  1114. break
  1115. }
  1116. temp.QueryTimes++
  1117. err = models.UpdateCloudbrainTemp(temp)
  1118. if err != nil {
  1119. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1120. }
  1121. if result != nil {
  1122. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1123. if result.VersionCount == int64(count+1) {
  1124. isExist = true
  1125. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1126. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1127. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1128. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1129. if err != nil {
  1130. log.Error("StopTrainJob failed:%v", err)
  1131. break
  1132. }
  1133. temp.Status = string(models.ModelArtsTrainJobKilling)
  1134. err = models.UpdateCloudbrainTemp(temp)
  1135. if err != nil {
  1136. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1137. break
  1138. }
  1139. } else {
  1140. log.Error("can not find the record(%s) till now", temp.JobName)
  1141. err = errors.New("not found")
  1142. break
  1143. }
  1144. }
  1145. break
  1146. }
  1147. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1148. log.Info("reach MaxTempQueryTimes, set the job failed")
  1149. temp.Status = string(models.ModelArtsTrainJobFailed)
  1150. err = models.UpdateCloudbrainTemp(temp)
  1151. if err != nil {
  1152. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1153. return err
  1154. }
  1155. }
  1156. return err
  1157. }
  1158. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1159. var err error
  1160. var isExist bool
  1161. for {
  1162. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1163. if err != nil {
  1164. log.Error("GetTrainJobList failed:%v", err)
  1165. break
  1166. }
  1167. temp.QueryTimes++
  1168. err = models.UpdateCloudbrainTemp(temp)
  1169. if err != nil {
  1170. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1171. }
  1172. if result != nil {
  1173. for _, job := range result.JobList {
  1174. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1175. isExist = true
  1176. temp.Status = TransTrainJobStatus(job.IntStatus)
  1177. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1178. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1179. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1180. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1181. if err != nil {
  1182. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1183. break
  1184. }
  1185. temp.Status = string(models.ModelArtsTrainJobKilling)
  1186. err = models.UpdateCloudbrainTemp(temp)
  1187. if err != nil {
  1188. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1189. break
  1190. }
  1191. }
  1192. }
  1193. if !isExist {
  1194. log.Error("can not find the record(%s) till now", temp.JobName)
  1195. err = errors.New("not found")
  1196. break
  1197. }
  1198. }
  1199. break
  1200. }
  1201. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1202. log.Info("reach MaxTempQueryTimes, set the job failed")
  1203. temp.Status = string(models.ModelArtsTrainJobFailed)
  1204. err = models.UpdateCloudbrainTemp(temp)
  1205. if err != nil {
  1206. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1207. return err
  1208. }
  1209. }
  1210. return err
  1211. }