You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 35 kB

5 years ago
3 years ago
4 years ago
4 years ago
5 years ago
3 years ago
5 years ago
3 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226
  1. package modelarts
  2. import (
  3. "code.gitea.io/gitea/modules/modelarts_cd"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "code.gitea.io/gitea/models"
  11. "code.gitea.io/gitea/modules/context"
  12. "code.gitea.io/gitea/modules/log"
  13. "code.gitea.io/gitea/modules/notification"
  14. "code.gitea.io/gitea/modules/setting"
  15. "code.gitea.io/gitea/modules/storage"
  16. "code.gitea.io/gitea/modules/timeutil"
  17. )
  18. const (
  19. //notebook
  20. storageTypeOBS = "obs"
  21. autoStopDuration = 4 * 60 * 60
  22. autoStopDurationMs = 4 * 60 * 60 * 1000
  23. MORDELART_USER_IMAGE_ENGINE_ID = -1
  24. DataSetMountPath = "/home/ma-user/work"
  25. NotebookEnv = "Python3"
  26. NotebookType = "Ascend"
  27. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  28. //train-job
  29. // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  30. // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  31. // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  32. // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  33. // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  34. // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  35. // "]}"
  36. // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  37. // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  38. // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  39. // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  40. // "]}"
  41. CodePath = "/code/"
  42. OutputPath = "/output/"
  43. ResultPath = "/result/"
  44. LogPath = "/log/"
  45. JobPath = "/job/"
  46. OrderDesc = "desc" //向下查询
  47. OrderAsc = "asc" //向上查询
  48. Lines = 500
  49. TrainUrl = "train_url"
  50. DataUrl = "data_url"
  51. MultiDataUrl = "multi_data_url"
  52. ResultUrl = "result_url"
  53. CkptUrl = "ckpt_url"
  54. DeviceTarget = "device_target"
  55. Ascend = "Ascend"
  56. PerPage = 10
  57. IsLatestVersion = "1"
  58. NotLatestVersion = "0"
  59. VersionCountOne = 1
  60. SortByCreateTime = "create_time"
  61. ConfigTypeCustom = "custom"
  62. TotalVersionCount = 1
  63. )
  64. var (
  65. poolInfos *models.PoolInfos
  66. TrainFlavorInfos *Flavor
  67. SpecialPools *models.SpecialPools
  68. )
  69. type GenerateTrainJobReq struct {
  70. JobName string
  71. DisplayJobName string
  72. Uuid string
  73. Description string
  74. CodeObsPath string
  75. BootFile string
  76. BootFileUrl string
  77. DataUrl string
  78. TrainUrl string
  79. FlavorCode string
  80. LogUrl string
  81. PoolID string
  82. WorkServerNumber int
  83. EngineID int64
  84. Parameters []models.Parameter
  85. CommitID string
  86. IsLatestVersion string
  87. Params string
  88. BranchName string
  89. PreVersionId int64
  90. PreVersionName string
  91. FlavorName string
  92. VersionCount int
  93. EngineName string
  94. TotalVersionCount int
  95. UserImageUrl string
  96. UserCommand string
  97. DatasetName string
  98. }
  99. type GenerateInferenceJobReq struct {
  100. JobName string
  101. DisplayJobName string
  102. Uuid string
  103. Description string
  104. CodeObsPath string
  105. BootFile string
  106. BootFileUrl string
  107. DataUrl string
  108. TrainUrl string
  109. FlavorCode string
  110. LogUrl string
  111. PoolID string
  112. WorkServerNumber int
  113. EngineID int64
  114. Parameters []models.Parameter
  115. CommitID string
  116. Params string
  117. BranchName string
  118. FlavorName string
  119. EngineName string
  120. LabelName string
  121. IsLatestVersion string
  122. VersionCount int
  123. TotalVersionCount int
  124. ModelName string
  125. ModelVersion string
  126. CkptName string
  127. ResultUrl string
  128. DatasetName string
  129. }
// VersionInfo is the JSON shape of a version list: an object whose "version"
// key holds entries with an id, a value and a url.
type VersionInfo struct {
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
		Url   string `json:"url"`
	} `json:"version"`
}
// Flavor is the JSON shape of a flavor list: an object whose "flavor" key holds
// entries with a code and a value.
type Flavor struct {
	Info []struct {
		Code  string `json:"code"`
		Value string `json:"value"`
	} `json:"flavor"`
}
// Engine is the JSON shape of an engine list: an object whose "engine" key holds
// entries with a numeric id and a value.
type Engine struct {
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
// ResourcePool is the JSON shape of a resource pool list: an object whose
// "resource_pool" key holds entries with a string id and a value.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
  155. // type Parameter struct {
  156. // Label string `json:"label"`
  157. // Value string `json:"value"`
  158. // }
  159. // type Parameters struct {
  160. // Parameter []Parameter `json:"parameter"`
  161. // }
// Parameters is the JSON shape of a parameter list: an object whose "parameter"
// key holds label/value pairs.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}
  168. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  169. var dataActualPath string
  170. if uuid != "" {
  171. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  172. } else {
  173. userPath := setting.UserBasePath + ctx.User.Name + "/"
  174. isExist, err := storage.ObsHasObject(userPath)
  175. if err != nil {
  176. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  177. return err
  178. }
  179. if !isExist {
  180. if err = storage.ObsCreateObject(userPath); err != nil {
  181. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  182. return err
  183. }
  184. }
  185. dataActualPath = setting.Bucket + "/" + userPath
  186. }
  187. if poolInfos == nil {
  188. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  189. }
  190. createTime := timeutil.TimeStampNow()
  191. jobResult, err := CreateJob(models.CreateNotebookParams{
  192. JobName: jobName,
  193. Description: description,
  194. ProfileID: setting.ProfileID,
  195. Flavor: flavor,
  196. Pool: models.Pool{
  197. ID: poolInfos.PoolInfo[0].PoolId,
  198. Name: poolInfos.PoolInfo[0].PoolName,
  199. Type: poolInfos.PoolInfo[0].PoolType,
  200. },
  201. Spec: models.Spec{
  202. Storage: models.Storage{
  203. Type: storageTypeOBS,
  204. Location: models.Location{
  205. Path: dataActualPath,
  206. },
  207. },
  208. AutoStop: models.AutoStop{
  209. Enable: true,
  210. Duration: autoStopDuration,
  211. },
  212. },
  213. })
  214. if err != nil {
  215. log.Error("CreateJob failed: %v", err.Error())
  216. return err
  217. }
  218. err = models.CreateCloudbrain(&models.Cloudbrain{
  219. Status: string(models.JobWaiting),
  220. UserID: ctx.User.ID,
  221. RepoID: ctx.Repo.Repository.ID,
  222. JobID: jobResult.ID,
  223. JobName: jobName,
  224. JobType: string(models.JobTypeDebug),
  225. Type: models.TypeCloudBrainTwo,
  226. Uuid: uuid,
  227. ComputeResource: models.NPUResource,
  228. CreatedUnix: createTime,
  229. UpdatedUnix: createTime,
  230. })
  231. if err != nil {
  232. return err
  233. }
  234. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  235. return nil
  236. }
  237. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
  238. if poolInfos == nil {
  239. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  240. }
  241. imageName, err := GetNotebookImageName(imageId)
  242. if err != nil {
  243. log.Error("GetNotebookImageName failed: %v", err.Error())
  244. return err
  245. }
  246. createTime := timeutil.TimeStampNow()
  247. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  248. JobName: jobName,
  249. Description: description,
  250. Flavor: flavor,
  251. Duration: autoStopDurationMs,
  252. ImageID: imageId,
  253. PoolID: poolInfos.PoolInfo[0].PoolId,
  254. Feature: models.NotebookFeature,
  255. Volume: models.VolumeReq{
  256. Capacity: setting.Capacity,
  257. Category: models.EVSCategory,
  258. Ownership: models.ManagedOwnership,
  259. },
  260. WorkspaceID: "0",
  261. })
  262. if err != nil {
  263. log.Error("createNotebook2 failed: %v", err.Error())
  264. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  265. log.Info("(%s)unknown error, set temp status", displayJobName)
  266. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  267. JobID: models.TempJobId,
  268. VersionID: models.TempVersionId,
  269. Status: models.TempJobStatus,
  270. Type: models.TypeCloudBrainTwo,
  271. JobName: jobName,
  272. JobType: string(models.JobTypeDebug),
  273. })
  274. if errTemp != nil {
  275. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  276. return errTemp
  277. }
  278. }
  279. return err
  280. }
  281. task := &models.Cloudbrain{
  282. Status: jobResult.Status,
  283. UserID: ctx.User.ID,
  284. RepoID: ctx.Repo.Repository.ID,
  285. JobID: jobResult.ID,
  286. JobName: jobName,
  287. FlavorCode: flavor,
  288. DisplayJobName: displayJobName,
  289. JobType: string(models.JobTypeDebug),
  290. Type: models.TypeCloudBrainTwo,
  291. Uuid: uuid,
  292. ComputeResource: models.NPUResource,
  293. Image: imageName,
  294. Description: description,
  295. CreatedUnix: createTime,
  296. UpdatedUnix: createTime,
  297. }
  298. err = models.CreateCloudbrain(task)
  299. if err != nil {
  300. return err
  301. }
  302. stringId := strconv.FormatInt(task.ID, 10)
  303. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  304. return nil
  305. }
  306. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  307. createTime := timeutil.TimeStampNow()
  308. var jobResult *models.CreateTrainJobResult
  309. var createErr error
  310. if req.EngineID < 0 {
  311. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  312. JobName: req.JobName,
  313. Description: req.Description,
  314. Config: models.UserImageConfig{
  315. WorkServerNum: req.WorkServerNumber,
  316. AppUrl: req.CodeObsPath,
  317. BootFileUrl: req.BootFileUrl,
  318. DataUrl: req.DataUrl,
  319. TrainUrl: req.TrainUrl,
  320. LogUrl: req.LogUrl,
  321. PoolID: req.PoolID,
  322. CreateVersion: true,
  323. Flavor: models.Flavor{
  324. Code: req.FlavorCode,
  325. },
  326. Parameter: req.Parameters,
  327. UserImageUrl: req.UserImageUrl,
  328. UserCommand: req.UserCommand,
  329. },
  330. })
  331. } else {
  332. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  333. JobName: req.JobName,
  334. Description: req.Description,
  335. Config: models.Config{
  336. WorkServerNum: req.WorkServerNumber,
  337. AppUrl: req.CodeObsPath,
  338. BootFileUrl: req.BootFileUrl,
  339. DataUrl: req.DataUrl,
  340. EngineID: req.EngineID,
  341. TrainUrl: req.TrainUrl,
  342. LogUrl: req.LogUrl,
  343. PoolID: req.PoolID,
  344. CreateVersion: true,
  345. Flavor: models.Flavor{
  346. Code: req.FlavorCode,
  347. },
  348. Parameter: req.Parameters,
  349. },
  350. })
  351. }
  352. if createErr != nil {
  353. log.Error("createTrainJob failed: %v", createErr.Error())
  354. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  355. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  356. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  357. JobID: models.TempJobId,
  358. VersionID: models.TempVersionId,
  359. Status: models.TempJobStatus,
  360. Type: models.TypeCloudBrainTwo,
  361. JobName: req.JobName,
  362. JobType: string(models.JobTypeTrain),
  363. })
  364. if errTemp != nil {
  365. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  366. return errTemp
  367. }
  368. }
  369. return createErr
  370. }
  371. jobId := strconv.FormatInt(jobResult.JobID, 10)
  372. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  373. Status: TransTrainJobStatus(jobResult.Status),
  374. UserID: ctx.User.ID,
  375. RepoID: ctx.Repo.Repository.ID,
  376. JobID: jobId,
  377. JobName: req.JobName,
  378. DisplayJobName: req.DisplayJobName,
  379. JobType: string(models.JobTypeTrain),
  380. Type: models.TypeCloudBrainTwo,
  381. VersionID: jobResult.VersionID,
  382. VersionName: jobResult.VersionName,
  383. Uuid: req.Uuid,
  384. DatasetName: req.DatasetName,
  385. CommitID: req.CommitID,
  386. IsLatestVersion: req.IsLatestVersion,
  387. ComputeResource: models.NPUResource,
  388. EngineID: req.EngineID,
  389. TrainUrl: req.TrainUrl,
  390. BranchName: req.BranchName,
  391. Parameters: req.Params,
  392. BootFile: req.BootFile,
  393. DataUrl: req.DataUrl,
  394. LogUrl: req.LogUrl,
  395. FlavorCode: req.FlavorCode,
  396. Description: req.Description,
  397. WorkServerNumber: req.WorkServerNumber,
  398. FlavorName: req.FlavorName,
  399. EngineName: req.EngineName,
  400. VersionCount: req.VersionCount,
  401. TotalVersionCount: req.TotalVersionCount,
  402. CreatedUnix: createTime,
  403. UpdatedUnix: createTime,
  404. })
  405. if createErr != nil {
  406. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  407. return createErr
  408. }
  409. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  410. return nil
  411. }
  412. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  413. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  414. JobName: req.JobName,
  415. Description: req.Description,
  416. Config: models.UserImageConfig{
  417. WorkServerNum: req.WorkServerNumber,
  418. AppUrl: req.CodeObsPath,
  419. BootFileUrl: req.BootFileUrl,
  420. DataUrl: req.DataUrl,
  421. TrainUrl: req.TrainUrl,
  422. LogUrl: req.LogUrl,
  423. PoolID: req.PoolID,
  424. CreateVersion: true,
  425. Flavor: models.Flavor{
  426. Code: req.FlavorCode,
  427. },
  428. Parameter: req.Parameters,
  429. UserImageUrl: req.UserImageUrl,
  430. UserCommand: req.UserCommand,
  431. },
  432. })
  433. }
  434. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  435. createTime := timeutil.TimeStampNow()
  436. var jobResult *models.CreateTrainJobResult
  437. var createErr error
  438. if req.EngineID < 0 {
  439. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  440. Description: req.Description,
  441. Config: models.TrainJobVersionUserImageConfig{
  442. WorkServerNum: req.WorkServerNumber,
  443. AppUrl: req.CodeObsPath,
  444. BootFileUrl: req.BootFileUrl,
  445. DataUrl: req.DataUrl,
  446. TrainUrl: req.TrainUrl,
  447. LogUrl: req.LogUrl,
  448. PoolID: req.PoolID,
  449. Flavor: models.Flavor{
  450. Code: req.FlavorCode,
  451. },
  452. Parameter: req.Parameters,
  453. PreVersionId: req.PreVersionId,
  454. UserImageUrl: req.UserImageUrl,
  455. UserCommand: req.UserCommand,
  456. },
  457. }, jobId)
  458. } else {
  459. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  460. Description: req.Description,
  461. Config: models.TrainJobVersionConfig{
  462. WorkServerNum: req.WorkServerNumber,
  463. AppUrl: req.CodeObsPath,
  464. BootFileUrl: req.BootFileUrl,
  465. DataUrl: req.DataUrl,
  466. EngineID: req.EngineID,
  467. TrainUrl: req.TrainUrl,
  468. LogUrl: req.LogUrl,
  469. PoolID: req.PoolID,
  470. Flavor: models.Flavor{
  471. Code: req.FlavorCode,
  472. },
  473. Parameter: req.Parameters,
  474. PreVersionId: req.PreVersionId,
  475. },
  476. }, jobId)
  477. }
  478. if createErr != nil {
  479. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  480. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  481. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  482. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  483. JobID: jobId,
  484. VersionID: models.TempVersionId,
  485. Status: models.TempJobStatus,
  486. Type: models.TypeCloudBrainTwo,
  487. JobName: req.JobName,
  488. JobType: string(models.JobTypeTrain),
  489. })
  490. if errTemp != nil {
  491. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  492. return errTemp
  493. }
  494. }
  495. return createErr
  496. }
  497. var jobTypes []string
  498. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  499. repo := ctx.Repo.Repository
  500. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  501. RepoID: repo.ID,
  502. Type: models.TypeCloudBrainTwo,
  503. JobTypes: jobTypes,
  504. JobID: strconv.FormatInt(jobResult.JobID, 10),
  505. })
  506. if createErr != nil {
  507. ctx.ServerError("Cloudbrain", createErr)
  508. return createErr
  509. }
  510. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  511. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  512. Status: TransTrainJobStatus(jobResult.Status),
  513. UserID: ctx.User.ID,
  514. RepoID: ctx.Repo.Repository.ID,
  515. JobID: strconv.FormatInt(jobResult.JobID, 10),
  516. JobName: req.JobName,
  517. DisplayJobName: req.DisplayJobName,
  518. JobType: string(models.JobTypeTrain),
  519. Type: models.TypeCloudBrainTwo,
  520. VersionID: jobResult.VersionID,
  521. VersionName: jobResult.VersionName,
  522. Uuid: req.Uuid,
  523. DatasetName: req.DatasetName,
  524. CommitID: req.CommitID,
  525. IsLatestVersion: req.IsLatestVersion,
  526. PreVersionName: req.PreVersionName,
  527. ComputeResource: models.NPUResource,
  528. EngineID: req.EngineID,
  529. TrainUrl: req.TrainUrl,
  530. BranchName: req.BranchName,
  531. Parameters: req.Params,
  532. BootFile: req.BootFile,
  533. DataUrl: req.DataUrl,
  534. LogUrl: req.LogUrl,
  535. PreVersionId: req.PreVersionId,
  536. FlavorCode: req.FlavorCode,
  537. Description: req.Description,
  538. WorkServerNumber: req.WorkServerNumber,
  539. FlavorName: req.FlavorName,
  540. EngineName: req.EngineName,
  541. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  542. VersionCount: VersionListCount + 1,
  543. CreatedUnix: createTime,
  544. UpdatedUnix: createTime,
  545. })
  546. if createErr != nil {
  547. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  548. return createErr
  549. }
  550. //将训练任务的上一版本的isLatestVersion设置为"0"
  551. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  552. if createErr != nil {
  553. ctx.ServerError("Update IsLatestVersion failed", createErr)
  554. return createErr
  555. }
  556. return createErr
  557. }
  558. func TransTrainJobStatus(status int) string {
  559. switch status {
  560. case 0:
  561. return "UNKNOWN"
  562. case 1:
  563. return "INIT"
  564. case 2:
  565. return "IMAGE_CREATING"
  566. case 3:
  567. return "IMAGE_FAILED"
  568. case 4:
  569. return "SUBMIT_TRYING"
  570. case 5:
  571. return "SUBMIT_FAILED"
  572. case 6:
  573. return "DELETE_FAILED"
  574. case 7:
  575. return "WAITING"
  576. case 8:
  577. return "RUNNING"
  578. case 9:
  579. return "KILLING"
  580. case 10:
  581. return "COMPLETED"
  582. case 11:
  583. return "FAILED"
  584. case 12:
  585. return "KILLED"
  586. case 13:
  587. return "CANCELED"
  588. case 14:
  589. return "LOST"
  590. case 15:
  591. return "SCALING"
  592. case 16:
  593. return "SUBMIT_MODEL_FAILED"
  594. case 17:
  595. return "DEPLOY_SERVICE_FAILED"
  596. case 18:
  597. return "CHECK_INIT"
  598. case 19:
  599. return "CHECK_RUNNING"
  600. case 20:
  601. return "CHECK_RUNNING_COMPLETED"
  602. case 21:
  603. return "CHECK_FAILED"
  604. default:
  605. return strconv.Itoa(status)
  606. }
  607. }
  608. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  609. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  610. VersionOutputPath = "V" + talVersionCountToString
  611. return VersionOutputPath
  612. }
  613. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  614. createTime := timeutil.TimeStampNow()
  615. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  616. JobName: req.JobName,
  617. Description: req.Description,
  618. InfConfig: models.InfConfig{
  619. WorkServerNum: req.WorkServerNumber,
  620. AppUrl: req.CodeObsPath,
  621. BootFileUrl: req.BootFileUrl,
  622. DataUrl: req.DataUrl,
  623. EngineID: req.EngineID,
  624. // TrainUrl: req.TrainUrl,
  625. LogUrl: req.LogUrl,
  626. PoolID: req.PoolID,
  627. CreateVersion: true,
  628. Flavor: models.Flavor{
  629. Code: req.FlavorCode,
  630. },
  631. Parameter: req.Parameters,
  632. },
  633. })
  634. if err != nil {
  635. log.Error("createInferenceJob failed: %v", err.Error())
  636. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  637. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  638. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  639. JobID: models.TempJobId,
  640. VersionID: models.TempVersionId,
  641. Status: models.TempJobStatus,
  642. Type: models.TypeCloudBrainTwo,
  643. JobName: req.JobName,
  644. JobType: string(models.JobTypeInference),
  645. })
  646. if err != nil {
  647. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  648. return err
  649. }
  650. }
  651. return err
  652. }
  653. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  654. // if err != nil {
  655. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  656. // return err
  657. // }
  658. jobID := strconv.FormatInt(jobResult.JobID, 10)
  659. err = models.CreateCloudbrain(&models.Cloudbrain{
  660. Status: TransTrainJobStatus(jobResult.Status),
  661. UserID: ctx.User.ID,
  662. RepoID: ctx.Repo.Repository.ID,
  663. JobID: jobID,
  664. JobName: req.JobName,
  665. DisplayJobName: req.DisplayJobName,
  666. JobType: string(models.JobTypeInference),
  667. Type: models.TypeCloudBrainTwo,
  668. VersionID: jobResult.VersionID,
  669. VersionName: jobResult.VersionName,
  670. Uuid: req.Uuid,
  671. DatasetName: req.DatasetName,
  672. CommitID: req.CommitID,
  673. EngineID: req.EngineID,
  674. TrainUrl: req.TrainUrl,
  675. BranchName: req.BranchName,
  676. Parameters: req.Params,
  677. BootFile: req.BootFile,
  678. DataUrl: req.DataUrl,
  679. LogUrl: req.LogUrl,
  680. FlavorCode: req.FlavorCode,
  681. Description: req.Description,
  682. WorkServerNumber: req.WorkServerNumber,
  683. FlavorName: req.FlavorName,
  684. EngineName: req.EngineName,
  685. LabelName: req.LabelName,
  686. IsLatestVersion: req.IsLatestVersion,
  687. ComputeResource: models.NPUResource,
  688. VersionCount: req.VersionCount,
  689. TotalVersionCount: req.TotalVersionCount,
  690. ModelName: req.ModelName,
  691. ModelVersion: req.ModelVersion,
  692. CkptName: req.CkptName,
  693. ResultUrl: req.ResultUrl,
  694. CreatedUnix: createTime,
  695. UpdatedUnix: createTime,
  696. })
  697. if err != nil {
  698. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  699. return err
  700. }
  701. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  702. return nil
  703. }
  704. func GetNotebookImageName(imageId string) (string, error) {
  705. var validImage = false
  706. var imageName = ""
  707. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  708. if imageInfo.Id == imageId {
  709. validImage = true
  710. imageName = imageInfo.Value
  711. }
  712. }
  713. if !validImage {
  714. log.Error("the image id(%s) is invalid", imageId)
  715. return imageName, errors.New("the image id is invalid")
  716. }
  717. return imageName, nil
  718. }
  719. func InitSpecialPool() {
  720. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  721. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  722. }
  723. }
  724. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  725. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  726. if err != nil {
  727. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  728. return err
  729. }
  730. if result != nil {
  731. oldStatus := task.Status
  732. task.Status = TransTrainJobStatus(result.IntStatus)
  733. task.Duration = result.Duration / 1000
  734. task.TrainJobDuration = result.TrainJobDuration
  735. if task.StartTime == 0 && result.StartTime > 0 {
  736. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  737. }
  738. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  739. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  740. task.EndTime = task.StartTime.Add(task.Duration)
  741. }
  742. task.CorrectCreateUnix()
  743. if oldStatus != task.Status {
  744. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  745. }
  746. err = models.UpdateJob(task)
  747. if err != nil {
  748. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  749. return err
  750. }
  751. }
  752. return nil
  753. }
  754. func HandleNotebookInfo(task *models.Cloudbrain) error {
  755. var result *models.GetNotebook2Result
  756. var err error
  757. if task.Type == models.TypeCloudBrainTwo {
  758. result, err = GetNotebook2(task.JobID)
  759. } else if task.Type == models.TypeCDCenter {
  760. result, err = modelarts_cd.GetNotebook(task.JobID)
  761. }
  762. if err != nil {
  763. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  764. return err
  765. }
  766. if result != nil {
  767. oldStatus := task.Status
  768. task.Status = result.Status
  769. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  770. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  771. }
  772. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  773. task.EndTime = timeutil.TimeStampNow()
  774. }
  775. task.CorrectCreateUnix()
  776. task.ComputeAndSetDuration()
  777. if oldStatus != task.Status {
  778. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  779. }
  780. if task.FlavorCode == "" {
  781. task.FlavorCode = result.Flavor
  782. }
  783. err = models.UpdateJob(task)
  784. if err != nil {
  785. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  786. return err
  787. }
  788. }
  789. return nil
  790. }
  791. func SyncTempStatusJob() {
  792. jobs, err := models.GetCloudBrainTempJobs()
  793. if err != nil {
  794. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  795. return
  796. }
  797. for _, temp := range jobs {
  798. log.Info("start to handle record: %s", temp.JobName)
  799. if temp.Type == models.TypeCloudBrainTwo {
  800. if temp.JobType == string(models.JobTypeDebug) {
  801. err = handleNotebook(temp)
  802. if err != nil {
  803. log.Error("handleNotebook falied:%v", err)
  804. break
  805. }
  806. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  807. _, err = models.GetCloudbrainByJobID(temp.JobID)
  808. if err != nil {
  809. //one version
  810. err = handleTrainJob(temp)
  811. if err != nil {
  812. log.Error("handleTrainJob falied:%v", err)
  813. break
  814. }
  815. } else {
  816. //multi version
  817. err = handleTrainJobMultiVersion(temp)
  818. if err != nil {
  819. log.Error("handleTrainJobMultiVersion falied:%v", err)
  820. break
  821. }
  822. }
  823. }
  824. }
  825. }
  826. return
  827. }
  828. func handleNotebook(temp *models.CloudbrainTemp) error {
  829. if temp.Status == models.TempJobStatus {
  830. err := handleTempNotebook(temp)
  831. if err != nil {
  832. log.Error("handleTempNotebook failed:%v", err)
  833. return err
  834. }
  835. } else if temp.Status == string(models.ModelArtsStopping) {
  836. res, err := GetNotebook2(temp.JobID)
  837. if err != nil {
  838. log.Error("GetNotebook2 failed:%v", err)
  839. return err
  840. }
  841. temp.Status = res.Status
  842. if temp.Status == string(models.ModelArtsStopped) {
  843. err = models.UpdateCloudbrainTemp(temp)
  844. if err != nil {
  845. log.Error("UpdateCloudbrainTemp failed:%v", err)
  846. return err
  847. }
  848. _, err := DelNotebook2(temp.JobID)
  849. if err != nil {
  850. log.Error("DelNotebook2 failed:%v", err)
  851. return err
  852. }
  853. temp.Status = string(models.ModelArtsDeleted)
  854. err = models.UpdateCloudbrainTemp(temp)
  855. if err != nil {
  856. log.Error("UpdateCloudbrainTemp failed:%v", err)
  857. return err
  858. }
  859. }
  860. }
  861. return nil
  862. }
  863. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  864. var err error
  865. var isExist bool
  866. for {
  867. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  868. if err != nil {
  869. log.Error("GetNotebookList failed:%v", err)
  870. break
  871. }
  872. temp.QueryTimes++
  873. err = models.UpdateCloudbrainTemp(temp)
  874. if err != nil {
  875. log.Error("UpdateCloudbrainTemp failed:%v", err)
  876. }
  877. if result != nil {
  878. for _, notebook := range result.NotebookList {
  879. if temp.JobID == models.TempJobId {
  880. //new notebook
  881. if notebook.JobName == temp.JobName {
  882. isExist = true
  883. temp.Status = notebook.Status
  884. temp.JobID = notebook.JobID
  885. break
  886. }
  887. } else {
  888. //restart: always can find one record
  889. if notebook.JobName == temp.JobName {
  890. if notebook.Status != string(models.ModelArtsStopped) {
  891. isExist = true
  892. temp.Status = notebook.Status
  893. temp.JobID = notebook.JobID
  894. break
  895. }
  896. }
  897. }
  898. }
  899. if isExist {
  900. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  901. if temp.Status == string(models.ModelArtsCreateFailed) {
  902. err = models.UpdateCloudbrainTemp(temp)
  903. if err != nil {
  904. log.Error("UpdateCloudbrainTemp failed:%v", err)
  905. break
  906. }
  907. _, err := DelNotebook2(temp.JobID)
  908. if err != nil {
  909. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  910. break
  911. }
  912. temp.Status = string(models.ModelArtsDeleted)
  913. } else {
  914. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  915. if err != nil {
  916. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  917. break
  918. }
  919. temp.Status = string(models.ModelArtsStopping)
  920. }
  921. models.UpdateCloudbrainTemp(temp)
  922. } else {
  923. log.Error("can not find the record(%s) till now", temp.JobName)
  924. err = errors.New("not found")
  925. break
  926. }
  927. } else {
  928. log.Error("can not find the record(%s) till now", temp.JobName)
  929. err = errors.New("not found")
  930. break
  931. }
  932. break
  933. }
  934. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  935. log.Info("reach MaxTempQueryTimes, set the job failed")
  936. temp.Status = string(models.ModelArtsTrainJobFailed)
  937. err = models.UpdateCloudbrainTemp(temp)
  938. if err != nil {
  939. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  940. return err
  941. }
  942. }
  943. return err
  944. }
  945. func handleTrainJob(temp *models.CloudbrainTemp) error {
  946. if temp.Status == models.TempJobStatus {
  947. err := handleTempTrainJob(temp)
  948. if err != nil {
  949. log.Error("handleTempTrainJob failed:%v", err)
  950. return err
  951. }
  952. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  953. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  954. if err != nil {
  955. log.Error("GetTrainJob failed:%v", err)
  956. return err
  957. }
  958. temp.Status = TransTrainJobStatus(res.IntStatus)
  959. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  960. err = models.UpdateCloudbrainTemp(temp)
  961. if err != nil {
  962. log.Error("UpdateCloudbrainTemp failed:%v", err)
  963. return err
  964. }
  965. _, err := DelTrainJob(temp.JobID)
  966. if err != nil {
  967. log.Error("DelTrainJob failed:%v", err)
  968. return err
  969. }
  970. temp.Status = string(models.ModelArtsDeleted)
  971. err = models.UpdateCloudbrainTemp(temp)
  972. if err != nil {
  973. log.Error("UpdateCloudbrainTemp failed:%v", err)
  974. return err
  975. }
  976. }
  977. }
  978. return nil
  979. }
  980. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  981. if temp.Status == models.TempJobStatus {
  982. err := handleTempTrainJobMultiVersion(temp)
  983. if err != nil {
  984. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  985. return err
  986. }
  987. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  988. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  989. if err != nil {
  990. log.Error("GetTrainJob failed:%v", err)
  991. return err
  992. }
  993. temp.Status = TransTrainJobStatus(res.IntStatus)
  994. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  995. err = models.UpdateCloudbrainTemp(temp)
  996. if err != nil {
  997. log.Error("UpdateCloudbrainTemp failed:%v", err)
  998. return err
  999. }
  1000. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1001. if err != nil {
  1002. log.Error("DelTrainJob failed:%v", err)
  1003. return err
  1004. }
  1005. temp.Status = string(models.ModelArtsDeleted)
  1006. err = models.UpdateCloudbrainTemp(temp)
  1007. if err != nil {
  1008. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1009. return err
  1010. }
  1011. }
  1012. }
  1013. return nil
  1014. }
  1015. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1016. var err error
  1017. var isExist bool
  1018. for {
  1019. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1020. if err != nil {
  1021. log.Error("GetTrainJobVersionList failed:%v", err)
  1022. break
  1023. }
  1024. temp.QueryTimes++
  1025. err = models.UpdateCloudbrainTemp(temp)
  1026. if err != nil {
  1027. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1028. }
  1029. if result != nil {
  1030. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1031. if result.VersionCount == int64(count+1) {
  1032. isExist = true
  1033. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1034. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1035. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1036. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1037. if err != nil {
  1038. log.Error("StopTrainJob failed:%v", err)
  1039. break
  1040. }
  1041. temp.Status = string(models.ModelArtsTrainJobKilling)
  1042. err = models.UpdateCloudbrainTemp(temp)
  1043. if err != nil {
  1044. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1045. break
  1046. }
  1047. } else {
  1048. log.Error("can not find the record(%s) till now", temp.JobName)
  1049. err = errors.New("not found")
  1050. break
  1051. }
  1052. }
  1053. break
  1054. }
  1055. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1056. log.Info("reach MaxTempQueryTimes, set the job failed")
  1057. temp.Status = string(models.ModelArtsTrainJobFailed)
  1058. err = models.UpdateCloudbrainTemp(temp)
  1059. if err != nil {
  1060. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1061. return err
  1062. }
  1063. }
  1064. return err
  1065. }
  1066. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1067. var err error
  1068. var isExist bool
  1069. for {
  1070. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1071. if err != nil {
  1072. log.Error("GetTrainJobList failed:%v", err)
  1073. break
  1074. }
  1075. temp.QueryTimes++
  1076. err = models.UpdateCloudbrainTemp(temp)
  1077. if err != nil {
  1078. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1079. }
  1080. if result != nil {
  1081. for _, job := range result.JobList {
  1082. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1083. isExist = true
  1084. temp.Status = TransTrainJobStatus(job.IntStatus)
  1085. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1086. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1087. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1088. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1089. if err != nil {
  1090. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1091. break
  1092. }
  1093. temp.Status = string(models.ModelArtsTrainJobKilling)
  1094. err = models.UpdateCloudbrainTemp(temp)
  1095. if err != nil {
  1096. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1097. break
  1098. }
  1099. }
  1100. }
  1101. if !isExist {
  1102. log.Error("can not find the record(%s) till now", temp.JobName)
  1103. err = errors.New("not found")
  1104. break
  1105. }
  1106. }
  1107. break
  1108. }
  1109. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1110. log.Info("reach MaxTempQueryTimes, set the job failed")
  1111. temp.Status = string(models.ModelArtsTrainJobFailed)
  1112. err = models.UpdateCloudbrainTemp(temp)
  1113. if err != nil {
  1114. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1115. return err
  1116. }
  1117. }
  1118. return err
  1119. }