You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 35 kB

5 years ago
4 years ago
4 years ago
5 years ago
3 years ago
5 years ago
3 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
5 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
5 years ago
5 years ago
3 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
5 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232
  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "path"
  7. "strconv"
  8. "strings"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. "code.gitea.io/gitea/modules/storage"
  15. "code.gitea.io/gitea/modules/timeutil"
  16. )
  17. const (
  18. //notebook
  19. storageTypeOBS = "obs"
  20. autoStopDuration = 4 * 60 * 60
  21. autoStopDurationMs = 4 * 60 * 60 * 1000
  22. MORDELART_USER_IMAGE_ENGINE_ID = -1
  23. DataSetMountPath = "/home/ma-user/work"
  24. NotebookEnv = "Python3"
  25. NotebookType = "Ascend"
  26. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  27. //train-job
  28. // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  29. // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  30. // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  31. // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  32. // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  33. // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  34. // "]}"
  35. // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  36. // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  37. // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  38. // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  39. // "]}"
  40. CodePath = "/code/"
  41. OutputPath = "/output/"
  42. ResultPath = "/result/"
  43. LogPath = "/log/"
  44. JobPath = "/job/"
  45. OrderDesc = "desc" //向下查询
  46. OrderAsc = "asc" //向上查询
  47. Lines = 500
  48. TrainUrl = "train_url"
  49. DataUrl = "data_url"
  50. MultiDataUrl = "multi_data_url"
  51. ResultUrl = "result_url"
  52. CkptUrl = "ckpt_url"
  53. DeviceTarget = "device_target"
  54. Ascend = "Ascend"
  55. PerPage = 10
  56. IsLatestVersion = "1"
  57. NotLatestVersion = "0"
  58. VersionCountOne = 1
  59. SortByCreateTime = "create_time"
  60. ConfigTypeCustom = "custom"
  61. TotalVersionCount = 1
  62. )
  63. var (
  64. poolInfos *models.PoolInfos
  65. FlavorInfos *models.FlavorInfos
  66. ImageInfos *models.ImageInfosModelArts
  67. TrainFlavorInfos *Flavor
  68. SpecialPools *models.SpecialPools
  69. )
  70. type GenerateTrainJobReq struct {
  71. JobName string
  72. DisplayJobName string
  73. Uuid string
  74. Description string
  75. CodeObsPath string
  76. BootFile string
  77. BootFileUrl string
  78. DataUrl string
  79. TrainUrl string
  80. LogUrl string
  81. PoolID string
  82. WorkServerNumber int
  83. EngineID int64
  84. Parameters []models.Parameter
  85. CommitID string
  86. IsLatestVersion string
  87. Params string
  88. BranchName string
  89. PreVersionId int64
  90. PreVersionName string
  91. FlavorCode string
  92. FlavorName string
  93. VersionCount int
  94. EngineName string
  95. TotalVersionCount int
  96. UserImageUrl string
  97. UserCommand string
  98. DatasetName string
  99. Spec *models.Specification
  100. }
  101. type GenerateInferenceJobReq struct {
  102. JobName string
  103. DisplayJobName string
  104. Uuid string
  105. Description string
  106. CodeObsPath string
  107. BootFile string
  108. BootFileUrl string
  109. DataUrl string
  110. TrainUrl string
  111. LogUrl string
  112. PoolID string
  113. WorkServerNumber int
  114. EngineID int64
  115. Parameters []models.Parameter
  116. CommitID string
  117. Params string
  118. BranchName string
  119. FlavorName string
  120. EngineName string
  121. LabelName string
  122. IsLatestVersion string
  123. VersionCount int
  124. TotalVersionCount int
  125. ModelName string
  126. ModelVersion string
  127. CkptName string
  128. ResultUrl string
  129. Spec *models.Specification
  130. DatasetName string
  131. }
  132. type VersionInfo struct {
  133. Version []struct {
  134. ID int `json:"id"`
  135. Value string `json:"value"`
  136. Url string `json:"url"`
  137. } `json:"version"`
  138. }
  139. type Flavor struct {
  140. Info []struct {
  141. Code string `json:"code"`
  142. Value string `json:"value"`
  143. UnitPrice int64 `json:"unitPrice"`
  144. } `json:"flavor"`
  145. }
  146. type Engine struct {
  147. Info []struct {
  148. ID int `json:"id"`
  149. Value string `json:"value"`
  150. } `json:"engine"`
  151. }
  152. type ResourcePool struct {
  153. Info []struct {
  154. ID string `json:"id"`
  155. Value string `json:"value"`
  156. } `json:"resource_pool"`
  157. }
  158. // type Parameter struct {
  159. // Label string `json:"label"`
  160. // Value string `json:"value"`
  161. // }
  162. // type Parameters struct {
  163. // Parameter []Parameter `json:"parameter"`
  164. // }
  165. type Parameters struct {
  166. Parameter []struct {
  167. Label string `json:"label"`
  168. Value string `json:"value"`
  169. } `json:"parameter"`
  170. }
  171. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  172. var dataActualPath string
  173. if uuid != "" {
  174. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  175. } else {
  176. userPath := setting.UserBasePath + ctx.User.Name + "/"
  177. isExist, err := storage.ObsHasObject(userPath)
  178. if err != nil {
  179. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  180. return err
  181. }
  182. if !isExist {
  183. if err = storage.ObsCreateObject(userPath); err != nil {
  184. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  185. return err
  186. }
  187. }
  188. dataActualPath = setting.Bucket + "/" + userPath
  189. }
  190. if poolInfos == nil {
  191. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  192. }
  193. createTime := timeutil.TimeStampNow()
  194. jobResult, err := CreateJob(models.CreateNotebookParams{
  195. JobName: jobName,
  196. Description: description,
  197. ProfileID: setting.ProfileID,
  198. Flavor: flavor,
  199. Pool: models.Pool{
  200. ID: poolInfos.PoolInfo[0].PoolId,
  201. Name: poolInfos.PoolInfo[0].PoolName,
  202. Type: poolInfos.PoolInfo[0].PoolType,
  203. },
  204. Spec: models.Spec{
  205. Storage: models.Storage{
  206. Type: storageTypeOBS,
  207. Location: models.Location{
  208. Path: dataActualPath,
  209. },
  210. },
  211. AutoStop: models.AutoStop{
  212. Enable: true,
  213. Duration: autoStopDuration,
  214. },
  215. },
  216. })
  217. if err != nil {
  218. log.Error("CreateJob failed: %v", err.Error())
  219. return err
  220. }
  221. err = models.CreateCloudbrain(&models.Cloudbrain{
  222. Status: string(models.JobWaiting),
  223. UserID: ctx.User.ID,
  224. RepoID: ctx.Repo.Repository.ID,
  225. JobID: jobResult.ID,
  226. JobName: jobName,
  227. JobType: string(models.JobTypeDebug),
  228. Type: models.TypeCloudBrainTwo,
  229. Uuid: uuid,
  230. ComputeResource: models.NPUResource,
  231. CreatedUnix: createTime,
  232. UpdatedUnix: createTime,
  233. })
  234. if err != nil {
  235. return err
  236. }
  237. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  238. return nil
  239. }
  240. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
  241. if poolInfos == nil {
  242. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  243. }
  244. imageName, err := GetNotebookImageName(imageId)
  245. if err != nil {
  246. log.Error("GetNotebookImageName failed: %v", err.Error())
  247. return err
  248. }
  249. createTime := timeutil.TimeStampNow()
  250. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  251. JobName: jobName,
  252. Description: description,
  253. Flavor: spec.SourceSpecId,
  254. Duration: autoStopDurationMs,
  255. ImageID: imageId,
  256. PoolID: poolInfos.PoolInfo[0].PoolId,
  257. Feature: models.NotebookFeature,
  258. Volume: models.VolumeReq{
  259. Capacity: setting.Capacity,
  260. Category: models.EVSCategory,
  261. Ownership: models.ManagedOwnership,
  262. },
  263. WorkspaceID: "0",
  264. })
  265. if err != nil {
  266. log.Error("createNotebook2 failed: %v", err.Error())
  267. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  268. log.Info("(%s)unknown error, set temp status", displayJobName)
  269. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  270. JobID: models.TempJobId,
  271. VersionID: models.TempVersionId,
  272. Status: models.TempJobStatus,
  273. Type: models.TypeCloudBrainTwo,
  274. JobName: jobName,
  275. JobType: string(models.JobTypeDebug),
  276. })
  277. if errTemp != nil {
  278. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  279. return errTemp
  280. }
  281. }
  282. return err
  283. }
  284. task := &models.Cloudbrain{
  285. Status: jobResult.Status,
  286. UserID: ctx.User.ID,
  287. RepoID: ctx.Repo.Repository.ID,
  288. JobID: jobResult.ID,
  289. JobName: jobName,
  290. FlavorCode: spec.SourceSpecId,
  291. DisplayJobName: displayJobName,
  292. JobType: string(models.JobTypeDebug),
  293. Type: models.TypeCloudBrainTwo,
  294. Uuid: uuid,
  295. ComputeResource: models.NPUResource,
  296. Image: imageName,
  297. Description: description,
  298. CreatedUnix: createTime,
  299. UpdatedUnix: createTime,
  300. Spec: spec,
  301. }
  302. err = models.CreateCloudbrain(task)
  303. if err != nil {
  304. return err
  305. }
  306. stringId := strconv.FormatInt(task.ID, 10)
  307. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  308. return nil
  309. }
  310. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  311. createTime := timeutil.TimeStampNow()
  312. var jobResult *models.CreateTrainJobResult
  313. var createErr error
  314. if req.EngineID < 0 {
  315. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  316. JobName: req.JobName,
  317. Description: req.Description,
  318. Config: models.UserImageConfig{
  319. WorkServerNum: req.WorkServerNumber,
  320. AppUrl: req.CodeObsPath,
  321. BootFileUrl: req.BootFileUrl,
  322. DataUrl: req.DataUrl,
  323. TrainUrl: req.TrainUrl,
  324. LogUrl: req.LogUrl,
  325. PoolID: req.PoolID,
  326. CreateVersion: true,
  327. Flavor: models.Flavor{
  328. Code: req.Spec.SourceSpecId,
  329. },
  330. Parameter: req.Parameters,
  331. UserImageUrl: req.UserImageUrl,
  332. UserCommand: req.UserCommand,
  333. },
  334. })
  335. } else {
  336. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  337. JobName: req.JobName,
  338. Description: req.Description,
  339. Config: models.Config{
  340. WorkServerNum: req.WorkServerNumber,
  341. AppUrl: req.CodeObsPath,
  342. BootFileUrl: req.BootFileUrl,
  343. DataUrl: req.DataUrl,
  344. EngineID: req.EngineID,
  345. TrainUrl: req.TrainUrl,
  346. LogUrl: req.LogUrl,
  347. PoolID: req.PoolID,
  348. CreateVersion: true,
  349. Flavor: models.Flavor{
  350. Code: req.Spec.SourceSpecId,
  351. },
  352. Parameter: req.Parameters,
  353. },
  354. })
  355. }
  356. if createErr != nil {
  357. log.Error("createTrainJob failed: %v", createErr.Error())
  358. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  359. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  360. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  361. JobID: models.TempJobId,
  362. VersionID: models.TempVersionId,
  363. Status: models.TempJobStatus,
  364. Type: models.TypeCloudBrainTwo,
  365. JobName: req.JobName,
  366. JobType: string(models.JobTypeTrain),
  367. })
  368. if errTemp != nil {
  369. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  370. return errTemp
  371. }
  372. }
  373. return createErr
  374. }
  375. jobId := strconv.FormatInt(jobResult.JobID, 10)
  376. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  377. Status: TransTrainJobStatus(jobResult.Status),
  378. UserID: ctx.User.ID,
  379. RepoID: ctx.Repo.Repository.ID,
  380. JobID: jobId,
  381. JobName: req.JobName,
  382. DisplayJobName: req.DisplayJobName,
  383. JobType: string(models.JobTypeTrain),
  384. Type: models.TypeCloudBrainTwo,
  385. VersionID: jobResult.VersionID,
  386. VersionName: jobResult.VersionName,
  387. Uuid: req.Uuid,
  388. DatasetName: req.DatasetName,
  389. CommitID: req.CommitID,
  390. IsLatestVersion: req.IsLatestVersion,
  391. ComputeResource: models.NPUResource,
  392. EngineID: req.EngineID,
  393. TrainUrl: req.TrainUrl,
  394. BranchName: req.BranchName,
  395. Parameters: req.Params,
  396. BootFile: req.BootFile,
  397. DataUrl: req.DataUrl,
  398. LogUrl: req.LogUrl,
  399. FlavorCode: req.Spec.SourceSpecId,
  400. Description: req.Description,
  401. WorkServerNumber: req.WorkServerNumber,
  402. FlavorName: req.FlavorName,
  403. EngineName: req.EngineName,
  404. VersionCount: req.VersionCount,
  405. TotalVersionCount: req.TotalVersionCount,
  406. CreatedUnix: createTime,
  407. UpdatedUnix: createTime,
  408. Spec: req.Spec,
  409. })
  410. if createErr != nil {
  411. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  412. return createErr
  413. }
  414. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  415. return nil
  416. }
  417. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  418. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  419. JobName: req.JobName,
  420. Description: req.Description,
  421. Config: models.UserImageConfig{
  422. WorkServerNum: req.WorkServerNumber,
  423. AppUrl: req.CodeObsPath,
  424. BootFileUrl: req.BootFileUrl,
  425. DataUrl: req.DataUrl,
  426. TrainUrl: req.TrainUrl,
  427. LogUrl: req.LogUrl,
  428. PoolID: req.PoolID,
  429. CreateVersion: true,
  430. Flavor: models.Flavor{
  431. Code: req.FlavorCode,
  432. },
  433. Parameter: req.Parameters,
  434. UserImageUrl: req.UserImageUrl,
  435. UserCommand: req.UserCommand,
  436. },
  437. })
  438. }
  439. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  440. createTime := timeutil.TimeStampNow()
  441. var jobResult *models.CreateTrainJobResult
  442. var createErr error
  443. if req.EngineID < 0 {
  444. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  445. Description: req.Description,
  446. Config: models.TrainJobVersionUserImageConfig{
  447. WorkServerNum: req.WorkServerNumber,
  448. AppUrl: req.CodeObsPath,
  449. BootFileUrl: req.BootFileUrl,
  450. DataUrl: req.DataUrl,
  451. TrainUrl: req.TrainUrl,
  452. LogUrl: req.LogUrl,
  453. PoolID: req.PoolID,
  454. Flavor: models.Flavor{
  455. Code: req.Spec.SourceSpecId,
  456. },
  457. Parameter: req.Parameters,
  458. PreVersionId: req.PreVersionId,
  459. UserImageUrl: req.UserImageUrl,
  460. UserCommand: req.UserCommand,
  461. },
  462. }, jobId)
  463. } else {
  464. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  465. Description: req.Description,
  466. Config: models.TrainJobVersionConfig{
  467. WorkServerNum: req.WorkServerNumber,
  468. AppUrl: req.CodeObsPath,
  469. BootFileUrl: req.BootFileUrl,
  470. DataUrl: req.DataUrl,
  471. EngineID: req.EngineID,
  472. TrainUrl: req.TrainUrl,
  473. LogUrl: req.LogUrl,
  474. PoolID: req.PoolID,
  475. Flavor: models.Flavor{
  476. Code: req.Spec.SourceSpecId,
  477. },
  478. Parameter: req.Parameters,
  479. PreVersionId: req.PreVersionId,
  480. },
  481. }, jobId)
  482. }
  483. if createErr != nil {
  484. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  485. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  486. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  487. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  488. JobID: jobId,
  489. VersionID: models.TempVersionId,
  490. Status: models.TempJobStatus,
  491. Type: models.TypeCloudBrainTwo,
  492. JobName: req.JobName,
  493. JobType: string(models.JobTypeTrain),
  494. })
  495. if errTemp != nil {
  496. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  497. return errTemp
  498. }
  499. }
  500. return createErr
  501. }
  502. var jobTypes []string
  503. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  504. repo := ctx.Repo.Repository
  505. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  506. RepoID: repo.ID,
  507. Type: models.TypeCloudBrainTwo,
  508. JobTypes: jobTypes,
  509. JobID: strconv.FormatInt(jobResult.JobID, 10),
  510. })
  511. if createErr != nil {
  512. ctx.ServerError("Cloudbrain", createErr)
  513. return createErr
  514. }
  515. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  516. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  517. Status: TransTrainJobStatus(jobResult.Status),
  518. UserID: ctx.User.ID,
  519. RepoID: ctx.Repo.Repository.ID,
  520. JobID: strconv.FormatInt(jobResult.JobID, 10),
  521. JobName: req.JobName,
  522. DisplayJobName: req.DisplayJobName,
  523. JobType: string(models.JobTypeTrain),
  524. Type: models.TypeCloudBrainTwo,
  525. VersionID: jobResult.VersionID,
  526. VersionName: jobResult.VersionName,
  527. Uuid: req.Uuid,
  528. DatasetName: req.DatasetName,
  529. CommitID: req.CommitID,
  530. IsLatestVersion: req.IsLatestVersion,
  531. PreVersionName: req.PreVersionName,
  532. ComputeResource: models.NPUResource,
  533. EngineID: req.EngineID,
  534. TrainUrl: req.TrainUrl,
  535. BranchName: req.BranchName,
  536. Parameters: req.Params,
  537. BootFile: req.BootFile,
  538. DataUrl: req.DataUrl,
  539. LogUrl: req.LogUrl,
  540. PreVersionId: req.PreVersionId,
  541. FlavorCode: req.Spec.SourceSpecId,
  542. Description: req.Description,
  543. WorkServerNumber: req.WorkServerNumber,
  544. FlavorName: req.FlavorName,
  545. EngineName: req.EngineName,
  546. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  547. VersionCount: VersionListCount + 1,
  548. CreatedUnix: createTime,
  549. UpdatedUnix: createTime,
  550. Spec: req.Spec,
  551. })
  552. if createErr != nil {
  553. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  554. return createErr
  555. }
  556. //将训练任务的上一版本的isLatestVersion设置为"0"
  557. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  558. if createErr != nil {
  559. ctx.ServerError("Update IsLatestVersion failed", createErr)
  560. return createErr
  561. }
  562. return createErr
  563. }
  564. func TransTrainJobStatus(status int) string {
  565. switch status {
  566. case 0:
  567. return "UNKNOWN"
  568. case 1:
  569. return "INIT"
  570. case 2:
  571. return "IMAGE_CREATING"
  572. case 3:
  573. return "IMAGE_FAILED"
  574. case 4:
  575. return "SUBMIT_TRYING"
  576. case 5:
  577. return "SUBMIT_FAILED"
  578. case 6:
  579. return "DELETE_FAILED"
  580. case 7:
  581. return "WAITING"
  582. case 8:
  583. return "RUNNING"
  584. case 9:
  585. return "KILLING"
  586. case 10:
  587. return "COMPLETED"
  588. case 11:
  589. return "FAILED"
  590. case 12:
  591. return "KILLED"
  592. case 13:
  593. return "CANCELED"
  594. case 14:
  595. return "LOST"
  596. case 15:
  597. return "SCALING"
  598. case 16:
  599. return "SUBMIT_MODEL_FAILED"
  600. case 17:
  601. return "DEPLOY_SERVICE_FAILED"
  602. case 18:
  603. return "CHECK_INIT"
  604. case 19:
  605. return "CHECK_RUNNING"
  606. case 20:
  607. return "CHECK_RUNNING_COMPLETED"
  608. case 21:
  609. return "CHECK_FAILED"
  610. default:
  611. return strconv.Itoa(status)
  612. }
  613. }
  614. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  615. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  616. VersionOutputPath = "V" + talVersionCountToString
  617. return VersionOutputPath
  618. }
  619. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  620. createTime := timeutil.TimeStampNow()
  621. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  622. JobName: req.JobName,
  623. Description: req.Description,
  624. InfConfig: models.InfConfig{
  625. WorkServerNum: req.WorkServerNumber,
  626. AppUrl: req.CodeObsPath,
  627. BootFileUrl: req.BootFileUrl,
  628. DataUrl: req.DataUrl,
  629. EngineID: req.EngineID,
  630. // TrainUrl: req.TrainUrl,
  631. LogUrl: req.LogUrl,
  632. PoolID: req.PoolID,
  633. CreateVersion: true,
  634. Flavor: models.Flavor{
  635. Code: req.Spec.SourceSpecId,
  636. },
  637. Parameter: req.Parameters,
  638. },
  639. })
  640. if err != nil {
  641. log.Error("createInferenceJob failed: %v", err.Error())
  642. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  643. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  644. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  645. JobID: models.TempJobId,
  646. VersionID: models.TempVersionId,
  647. Status: models.TempJobStatus,
  648. Type: models.TypeCloudBrainTwo,
  649. JobName: req.JobName,
  650. JobType: string(models.JobTypeInference),
  651. })
  652. if err != nil {
  653. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  654. return err
  655. }
  656. }
  657. return err
  658. }
  659. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  660. // if err != nil {
  661. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  662. // return err
  663. // }
  664. jobID := strconv.FormatInt(jobResult.JobID, 10)
  665. err = models.CreateCloudbrain(&models.Cloudbrain{
  666. Status: TransTrainJobStatus(jobResult.Status),
  667. UserID: ctx.User.ID,
  668. RepoID: ctx.Repo.Repository.ID,
  669. JobID: jobID,
  670. JobName: req.JobName,
  671. DisplayJobName: req.DisplayJobName,
  672. JobType: string(models.JobTypeInference),
  673. Type: models.TypeCloudBrainTwo,
  674. VersionID: jobResult.VersionID,
  675. VersionName: jobResult.VersionName,
  676. Uuid: req.Uuid,
  677. DatasetName: req.DatasetName,
  678. CommitID: req.CommitID,
  679. EngineID: req.EngineID,
  680. TrainUrl: req.TrainUrl,
  681. BranchName: req.BranchName,
  682. Parameters: req.Params,
  683. BootFile: req.BootFile,
  684. DataUrl: req.DataUrl,
  685. LogUrl: req.LogUrl,
  686. FlavorCode: req.Spec.SourceSpecId,
  687. Description: req.Description,
  688. WorkServerNumber: req.WorkServerNumber,
  689. FlavorName: req.FlavorName,
  690. EngineName: req.EngineName,
  691. LabelName: req.LabelName,
  692. IsLatestVersion: req.IsLatestVersion,
  693. ComputeResource: models.NPUResource,
  694. VersionCount: req.VersionCount,
  695. TotalVersionCount: req.TotalVersionCount,
  696. ModelName: req.ModelName,
  697. ModelVersion: req.ModelVersion,
  698. CkptName: req.CkptName,
  699. ResultUrl: req.ResultUrl,
  700. CreatedUnix: createTime,
  701. UpdatedUnix: createTime,
  702. Spec: req.Spec,
  703. })
  704. if err != nil {
  705. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  706. return err
  707. }
  708. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  709. return nil
  710. }
  711. func GetNotebookImageName(imageId string) (string, error) {
  712. var validImage = false
  713. var imageName = ""
  714. if ImageInfos == nil {
  715. json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos)
  716. }
  717. for _, imageInfo := range ImageInfos.ImageInfo {
  718. if imageInfo.Id == imageId {
  719. validImage = true
  720. imageName = imageInfo.Value
  721. }
  722. }
  723. if !validImage {
  724. log.Error("the image id(%s) is invalid", imageId)
  725. return imageName, errors.New("the image id is invalid")
  726. }
  727. return imageName, nil
  728. }
  729. func InitSpecialPool() {
  730. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  731. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  732. }
  733. }
  734. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  735. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  736. if err != nil {
  737. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  738. return err
  739. }
  740. if result != nil {
  741. oldStatus := task.Status
  742. task.Status = TransTrainJobStatus(result.IntStatus)
  743. task.Duration = result.Duration / 1000
  744. task.TrainJobDuration = result.TrainJobDuration
  745. if task.StartTime == 0 && result.StartTime > 0 {
  746. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  747. }
  748. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  749. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  750. task.EndTime = task.StartTime.Add(task.Duration)
  751. }
  752. task.CorrectCreateUnix()
  753. if oldStatus != task.Status {
  754. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  755. }
  756. err = models.UpdateJob(task)
  757. if err != nil {
  758. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  759. return err
  760. }
  761. }
  762. return nil
  763. }
  764. func HandleNotebookInfo(task *models.Cloudbrain) error {
  765. result, err := GetNotebook2(task.JobID)
  766. if err != nil {
  767. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  768. return err
  769. }
  770. if result != nil {
  771. oldStatus := task.Status
  772. task.Status = result.Status
  773. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  774. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  775. }
  776. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  777. task.EndTime = timeutil.TimeStampNow()
  778. }
  779. task.CorrectCreateUnix()
  780. task.ComputeAndSetDuration()
  781. if oldStatus != task.Status {
  782. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  783. }
  784. if task.FlavorCode == "" {
  785. task.FlavorCode = result.Flavor
  786. }
  787. err = models.UpdateJob(task)
  788. if err != nil {
  789. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  790. return err
  791. }
  792. }
  793. return nil
  794. }
  795. func SyncTempStatusJob() {
  796. jobs, err := models.GetCloudBrainTempJobs()
  797. if err != nil {
  798. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  799. return
  800. }
  801. for _, temp := range jobs {
  802. log.Info("start to handle record: %s", temp.JobName)
  803. if temp.Type == models.TypeCloudBrainTwo {
  804. if temp.JobType == string(models.JobTypeDebug) {
  805. err = handleNotebook(temp)
  806. if err != nil {
  807. log.Error("handleNotebook falied:%v", err)
  808. break
  809. }
  810. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  811. _, err = models.GetCloudbrainByJobID(temp.JobID)
  812. if err != nil {
  813. //one version
  814. err = handleTrainJob(temp)
  815. if err != nil {
  816. log.Error("handleTrainJob falied:%v", err)
  817. break
  818. }
  819. } else {
  820. //multi version
  821. err = handleTrainJobMultiVersion(temp)
  822. if err != nil {
  823. log.Error("handleTrainJobMultiVersion falied:%v", err)
  824. break
  825. }
  826. }
  827. }
  828. }
  829. }
  830. return
  831. }
  832. func handleNotebook(temp *models.CloudbrainTemp) error {
  833. if temp.Status == models.TempJobStatus {
  834. err := handleTempNotebook(temp)
  835. if err != nil {
  836. log.Error("handleTempNotebook failed:%v", err)
  837. return err
  838. }
  839. } else if temp.Status == string(models.ModelArtsStopping) {
  840. res, err := GetNotebook2(temp.JobID)
  841. if err != nil {
  842. log.Error("GetNotebook2 failed:%v", err)
  843. return err
  844. }
  845. temp.Status = res.Status
  846. if temp.Status == string(models.ModelArtsStopped) {
  847. err = models.UpdateCloudbrainTemp(temp)
  848. if err != nil {
  849. log.Error("UpdateCloudbrainTemp failed:%v", err)
  850. return err
  851. }
  852. _, err := DelNotebook2(temp.JobID)
  853. if err != nil {
  854. log.Error("DelNotebook2 failed:%v", err)
  855. return err
  856. }
  857. temp.Status = string(models.ModelArtsDeleted)
  858. err = models.UpdateCloudbrainTemp(temp)
  859. if err != nil {
  860. log.Error("UpdateCloudbrainTemp failed:%v", err)
  861. return err
  862. }
  863. }
  864. }
  865. return nil
  866. }
  867. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  868. var err error
  869. var isExist bool
  870. for {
  871. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  872. if err != nil {
  873. log.Error("GetNotebookList failed:%v", err)
  874. break
  875. }
  876. temp.QueryTimes++
  877. err = models.UpdateCloudbrainTemp(temp)
  878. if err != nil {
  879. log.Error("UpdateCloudbrainTemp failed:%v", err)
  880. }
  881. if result != nil {
  882. for _, notebook := range result.NotebookList {
  883. if temp.JobID == models.TempJobId {
  884. //new notebook
  885. if notebook.JobName == temp.JobName {
  886. isExist = true
  887. temp.Status = notebook.Status
  888. temp.JobID = notebook.JobID
  889. break
  890. }
  891. } else {
  892. //restart: always can find one record
  893. if notebook.JobName == temp.JobName {
  894. if notebook.Status != string(models.ModelArtsStopped) {
  895. isExist = true
  896. temp.Status = notebook.Status
  897. temp.JobID = notebook.JobID
  898. break
  899. }
  900. }
  901. }
  902. }
  903. if isExist {
  904. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  905. if temp.Status == string(models.ModelArtsCreateFailed) {
  906. err = models.UpdateCloudbrainTemp(temp)
  907. if err != nil {
  908. log.Error("UpdateCloudbrainTemp failed:%v", err)
  909. break
  910. }
  911. _, err := DelNotebook2(temp.JobID)
  912. if err != nil {
  913. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  914. break
  915. }
  916. temp.Status = string(models.ModelArtsDeleted)
  917. } else {
  918. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  919. if err != nil {
  920. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  921. break
  922. }
  923. temp.Status = string(models.ModelArtsStopping)
  924. }
  925. models.UpdateCloudbrainTemp(temp)
  926. } else {
  927. log.Error("can not find the record(%s) till now", temp.JobName)
  928. err = errors.New("not found")
  929. break
  930. }
  931. } else {
  932. log.Error("can not find the record(%s) till now", temp.JobName)
  933. err = errors.New("not found")
  934. break
  935. }
  936. break
  937. }
  938. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  939. log.Info("reach MaxTempQueryTimes, set the job failed")
  940. temp.Status = string(models.ModelArtsTrainJobFailed)
  941. err = models.UpdateCloudbrainTemp(temp)
  942. if err != nil {
  943. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  944. return err
  945. }
  946. }
  947. return err
  948. }
  949. func handleTrainJob(temp *models.CloudbrainTemp) error {
  950. if temp.Status == models.TempJobStatus {
  951. err := handleTempTrainJob(temp)
  952. if err != nil {
  953. log.Error("handleTempTrainJob failed:%v", err)
  954. return err
  955. }
  956. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  957. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  958. if err != nil {
  959. log.Error("GetTrainJob failed:%v", err)
  960. return err
  961. }
  962. temp.Status = TransTrainJobStatus(res.IntStatus)
  963. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  964. err = models.UpdateCloudbrainTemp(temp)
  965. if err != nil {
  966. log.Error("UpdateCloudbrainTemp failed:%v", err)
  967. return err
  968. }
  969. _, err := DelTrainJob(temp.JobID)
  970. if err != nil {
  971. log.Error("DelTrainJob failed:%v", err)
  972. return err
  973. }
  974. temp.Status = string(models.ModelArtsDeleted)
  975. err = models.UpdateCloudbrainTemp(temp)
  976. if err != nil {
  977. log.Error("UpdateCloudbrainTemp failed:%v", err)
  978. return err
  979. }
  980. }
  981. }
  982. return nil
  983. }
  984. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  985. if temp.Status == models.TempJobStatus {
  986. err := handleTempTrainJobMultiVersion(temp)
  987. if err != nil {
  988. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  989. return err
  990. }
  991. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  992. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  993. if err != nil {
  994. log.Error("GetTrainJob failed:%v", err)
  995. return err
  996. }
  997. temp.Status = TransTrainJobStatus(res.IntStatus)
  998. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  999. err = models.UpdateCloudbrainTemp(temp)
  1000. if err != nil {
  1001. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1002. return err
  1003. }
  1004. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1005. if err != nil {
  1006. log.Error("DelTrainJob failed:%v", err)
  1007. return err
  1008. }
  1009. temp.Status = string(models.ModelArtsDeleted)
  1010. err = models.UpdateCloudbrainTemp(temp)
  1011. if err != nil {
  1012. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1013. return err
  1014. }
  1015. }
  1016. }
  1017. return nil
  1018. }
  1019. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1020. var err error
  1021. var isExist bool
  1022. for {
  1023. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1024. if err != nil {
  1025. log.Error("GetTrainJobVersionList failed:%v", err)
  1026. break
  1027. }
  1028. temp.QueryTimes++
  1029. err = models.UpdateCloudbrainTemp(temp)
  1030. if err != nil {
  1031. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1032. }
  1033. if result != nil {
  1034. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1035. if result.VersionCount == int64(count+1) {
  1036. isExist = true
  1037. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1038. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1039. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1040. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1041. if err != nil {
  1042. log.Error("StopTrainJob failed:%v", err)
  1043. break
  1044. }
  1045. temp.Status = string(models.ModelArtsTrainJobKilling)
  1046. err = models.UpdateCloudbrainTemp(temp)
  1047. if err != nil {
  1048. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1049. break
  1050. }
  1051. } else {
  1052. log.Error("can not find the record(%s) till now", temp.JobName)
  1053. err = errors.New("not found")
  1054. break
  1055. }
  1056. }
  1057. break
  1058. }
  1059. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1060. log.Info("reach MaxTempQueryTimes, set the job failed")
  1061. temp.Status = string(models.ModelArtsTrainJobFailed)
  1062. err = models.UpdateCloudbrainTemp(temp)
  1063. if err != nil {
  1064. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1065. return err
  1066. }
  1067. }
  1068. return err
  1069. }
  1070. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1071. var err error
  1072. var isExist bool
  1073. for {
  1074. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1075. if err != nil {
  1076. log.Error("GetTrainJobList failed:%v", err)
  1077. break
  1078. }
  1079. temp.QueryTimes++
  1080. err = models.UpdateCloudbrainTemp(temp)
  1081. if err != nil {
  1082. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1083. }
  1084. if result != nil {
  1085. for _, job := range result.JobList {
  1086. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1087. isExist = true
  1088. temp.Status = TransTrainJobStatus(job.IntStatus)
  1089. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1090. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1091. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1092. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1093. if err != nil {
  1094. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1095. break
  1096. }
  1097. temp.Status = string(models.ModelArtsTrainJobKilling)
  1098. err = models.UpdateCloudbrainTemp(temp)
  1099. if err != nil {
  1100. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1101. break
  1102. }
  1103. }
  1104. }
  1105. if !isExist {
  1106. log.Error("can not find the record(%s) till now", temp.JobName)
  1107. err = errors.New("not found")
  1108. break
  1109. }
  1110. }
  1111. break
  1112. }
  1113. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1114. log.Info("reach MaxTempQueryTimes, set the job failed")
  1115. temp.Status = string(models.ModelArtsTrainJobFailed)
  1116. err = models.UpdateCloudbrainTemp(temp)
  1117. if err != nil {
  1118. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1119. return err
  1120. }
  1121. }
  1122. return err
  1123. }