You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 36 kB

4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247
  1. package modelarts
  2. import (
  3. "code.gitea.io/gitea/modules/modelarts_cd"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "code.gitea.io/gitea/models"
  11. "code.gitea.io/gitea/modules/context"
  12. "code.gitea.io/gitea/modules/log"
  13. "code.gitea.io/gitea/modules/notification"
  14. "code.gitea.io/gitea/modules/setting"
  15. "code.gitea.io/gitea/modules/storage"
  16. "code.gitea.io/gitea/modules/timeutil"
  17. )
  18. const (
  19. //notebook
  20. storageTypeOBS = "obs"
  21. autoStopDuration = 4 * 60 * 60
  22. autoStopDurationMs = 4 * 60 * 60 * 1000
  23. MORDELART_USER_IMAGE_ENGINE_ID = -1
  24. DataSetMountPath = "/home/ma-user/work"
  25. NotebookEnv = "Python3"
  26. NotebookType = "Ascend"
  27. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  28. //train-job
  29. // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  30. // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  31. // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  32. // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  33. // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  34. // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  35. // "]}"
  36. // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  37. // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  38. // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  39. // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  40. // "]}"
  41. CodePath = "/code/"
  42. OutputPath = "/output/"
  43. ResultPath = "/result/"
  44. LogPath = "/log/"
  45. JobPath = "/job/"
  46. OrderDesc = "desc" //向下查询
  47. OrderAsc = "asc" //向上查询
  48. Lines = 500
  49. TrainUrl = "train_url"
  50. DataUrl = "data_url"
  51. MultiDataUrl = "multi_data_url"
  52. ResultUrl = "result_url"
  53. CkptUrl = "ckpt_url"
  54. DeviceTarget = "device_target"
  55. Ascend = "Ascend"
  56. PerPage = 10
  57. IsLatestVersion = "1"
  58. NotLatestVersion = "0"
  59. VersionCountOne = 1
  60. SortByCreateTime = "create_time"
  61. ConfigTypeCustom = "custom"
  62. TotalVersionCount = 1
  63. )
  64. var (
  65. poolInfos *models.PoolInfos
  66. TrainFlavorInfos *Flavor
  67. SpecialPools *models.SpecialPools
  68. MultiNodeConfig *MultiNodes
  69. )
  70. type GenerateTrainJobReq struct {
  71. JobName string
  72. DisplayJobName string
  73. Uuid string
  74. Description string
  75. CodeObsPath string
  76. BootFile string
  77. BootFileUrl string
  78. DataUrl string
  79. TrainUrl string
  80. LogUrl string
  81. PoolID string
  82. WorkServerNumber int
  83. EngineID int64
  84. Parameters []models.Parameter
  85. CommitID string
  86. IsLatestVersion string
  87. Params string
  88. BranchName string
  89. PreVersionId int64
  90. PreVersionName string
  91. FlavorCode string
  92. FlavorName string
  93. VersionCount int
  94. EngineName string
  95. TotalVersionCount int
  96. UserImageUrl string
  97. UserCommand string
  98. DatasetName string
  99. Spec *models.Specification
  100. }
  101. type GenerateInferenceJobReq struct {
  102. JobName string
  103. DisplayJobName string
  104. Uuid string
  105. Description string
  106. CodeObsPath string
  107. BootFile string
  108. BootFileUrl string
  109. DataUrl string
  110. TrainUrl string
  111. LogUrl string
  112. PoolID string
  113. WorkServerNumber int
  114. EngineID int64
  115. Parameters []models.Parameter
  116. CommitID string
  117. Params string
  118. BranchName string
  119. FlavorName string
  120. EngineName string
  121. LabelName string
  122. IsLatestVersion string
  123. VersionCount int
  124. TotalVersionCount int
  125. ModelName string
  126. ModelVersion string
  127. CkptName string
  128. ResultUrl string
  129. Spec *models.Specification
  130. DatasetName string
  131. }
  132. type VersionInfo struct {
  133. Version []struct {
  134. ID int `json:"id"`
  135. Value string `json:"value"`
  136. Url string `json:"url"`
  137. } `json:"version"`
  138. }
  139. type Flavor struct {
  140. Info []struct {
  141. Code string `json:"code"`
  142. Value string `json:"value"`
  143. } `json:"flavor"`
  144. }
  145. type Engine struct {
  146. Info []struct {
  147. ID int `json:"id"`
  148. Value string `json:"value"`
  149. } `json:"engine"`
  150. }
  151. type ResourcePool struct {
  152. Info []struct {
  153. ID string `json:"id"`
  154. Value string `json:"value"`
  155. } `json:"resource_pool"`
  156. }
  157. type MultiNodes struct{
  158. Info []OrgMultiNode `json:"multinode"`
  159. }
  160. type OrgMultiNode struct{
  161. Org string `json:"org"`
  162. Node []int `json:"node"`
  163. }
  164. // type Parameter struct {
  165. // Label string `json:"label"`
  166. // Value string `json:"value"`
  167. // }
  168. // type Parameters struct {
  169. // Parameter []Parameter `json:"parameter"`
  170. // }
  171. type Parameters struct {
  172. Parameter []struct {
  173. Label string `json:"label"`
  174. Value string `json:"value"`
  175. } `json:"parameter"`
  176. }
  177. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  178. var dataActualPath string
  179. if uuid != "" {
  180. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  181. } else {
  182. userPath := setting.UserBasePath + ctx.User.Name + "/"
  183. isExist, err := storage.ObsHasObject(userPath)
  184. if err != nil {
  185. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  186. return err
  187. }
  188. if !isExist {
  189. if err = storage.ObsCreateObject(userPath); err != nil {
  190. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  191. return err
  192. }
  193. }
  194. dataActualPath = setting.Bucket + "/" + userPath
  195. }
  196. if poolInfos == nil {
  197. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  198. }
  199. createTime := timeutil.TimeStampNow()
  200. jobResult, err := CreateJob(models.CreateNotebookParams{
  201. JobName: jobName,
  202. Description: description,
  203. ProfileID: setting.ProfileID,
  204. Flavor: flavor,
  205. Pool: models.Pool{
  206. ID: poolInfos.PoolInfo[0].PoolId,
  207. Name: poolInfos.PoolInfo[0].PoolName,
  208. Type: poolInfos.PoolInfo[0].PoolType,
  209. },
  210. Spec: models.Spec{
  211. Storage: models.Storage{
  212. Type: storageTypeOBS,
  213. Location: models.Location{
  214. Path: dataActualPath,
  215. },
  216. },
  217. AutoStop: models.AutoStop{
  218. Enable: true,
  219. Duration: autoStopDuration,
  220. },
  221. },
  222. })
  223. if err != nil {
  224. log.Error("CreateJob failed: %v", err.Error())
  225. return err
  226. }
  227. err = models.CreateCloudbrain(&models.Cloudbrain{
  228. Status: string(models.JobWaiting),
  229. UserID: ctx.User.ID,
  230. RepoID: ctx.Repo.Repository.ID,
  231. JobID: jobResult.ID,
  232. JobName: jobName,
  233. JobType: string(models.JobTypeDebug),
  234. Type: models.TypeCloudBrainTwo,
  235. Uuid: uuid,
  236. ComputeResource: models.NPUResource,
  237. CreatedUnix: createTime,
  238. UpdatedUnix: createTime,
  239. })
  240. if err != nil {
  241. return err
  242. }
  243. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  244. return nil
  245. }
  246. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
  247. if poolInfos == nil {
  248. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  249. }
  250. imageName, err := GetNotebookImageName(imageId)
  251. if err != nil {
  252. log.Error("GetNotebookImageName failed: %v", err.Error())
  253. return err
  254. }
  255. createTime := timeutil.TimeStampNow()
  256. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  257. JobName: jobName,
  258. Description: description,
  259. Flavor: spec.SourceSpecId,
  260. Duration: autoStopDurationMs,
  261. ImageID: imageId,
  262. PoolID: poolInfos.PoolInfo[0].PoolId,
  263. Feature: models.NotebookFeature,
  264. Volume: models.VolumeReq{
  265. Capacity: setting.Capacity,
  266. Category: models.EVSCategory,
  267. Ownership: models.ManagedOwnership,
  268. },
  269. WorkspaceID: "0",
  270. })
  271. if err != nil {
  272. log.Error("createNotebook2 failed: %v", err.Error())
  273. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  274. log.Info("(%s)unknown error, set temp status", displayJobName)
  275. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  276. JobID: models.TempJobId,
  277. VersionID: models.TempVersionId,
  278. Status: models.TempJobStatus,
  279. Type: models.TypeCloudBrainTwo,
  280. JobName: jobName,
  281. JobType: string(models.JobTypeDebug),
  282. })
  283. if errTemp != nil {
  284. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  285. return errTemp
  286. }
  287. }
  288. return err
  289. }
  290. task := &models.Cloudbrain{
  291. Status: jobResult.Status,
  292. UserID: ctx.User.ID,
  293. RepoID: ctx.Repo.Repository.ID,
  294. JobID: jobResult.ID,
  295. JobName: jobName,
  296. FlavorCode: spec.SourceSpecId,
  297. DisplayJobName: displayJobName,
  298. JobType: string(models.JobTypeDebug),
  299. Type: models.TypeCloudBrainTwo,
  300. Uuid: uuid,
  301. ComputeResource: models.NPUResource,
  302. Image: imageName,
  303. Description: description,
  304. CreatedUnix: createTime,
  305. UpdatedUnix: createTime,
  306. Spec: spec,
  307. }
  308. err = models.CreateCloudbrain(task)
  309. if err != nil {
  310. return err
  311. }
  312. stringId := strconv.FormatInt(task.ID, 10)
  313. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  314. return nil
  315. }
  316. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  317. createTime := timeutil.TimeStampNow()
  318. var jobResult *models.CreateTrainJobResult
  319. var createErr error
  320. if req.EngineID < 0 {
  321. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  322. JobName: req.JobName,
  323. Description: req.Description,
  324. Config: models.UserImageConfig{
  325. WorkServerNum: req.WorkServerNumber,
  326. AppUrl: req.CodeObsPath,
  327. BootFileUrl: req.BootFileUrl,
  328. DataUrl: req.DataUrl,
  329. TrainUrl: req.TrainUrl,
  330. LogUrl: req.LogUrl,
  331. PoolID: req.PoolID,
  332. CreateVersion: true,
  333. Flavor: models.Flavor{
  334. Code: req.Spec.SourceSpecId,
  335. },
  336. Parameter: req.Parameters,
  337. UserImageUrl: req.UserImageUrl,
  338. UserCommand: req.UserCommand,
  339. },
  340. })
  341. } else {
  342. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  343. JobName: req.JobName,
  344. Description: req.Description,
  345. Config: models.Config{
  346. WorkServerNum: req.WorkServerNumber,
  347. AppUrl: req.CodeObsPath,
  348. BootFileUrl: req.BootFileUrl,
  349. DataUrl: req.DataUrl,
  350. EngineID: req.EngineID,
  351. TrainUrl: req.TrainUrl,
  352. LogUrl: req.LogUrl,
  353. PoolID: req.PoolID,
  354. CreateVersion: true,
  355. Flavor: models.Flavor{
  356. Code: req.Spec.SourceSpecId,
  357. },
  358. Parameter: req.Parameters,
  359. },
  360. })
  361. }
  362. if createErr != nil {
  363. log.Error("createTrainJob failed: %v", createErr.Error())
  364. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  365. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  366. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  367. JobID: models.TempJobId,
  368. VersionID: models.TempVersionId,
  369. Status: models.TempJobStatus,
  370. Type: models.TypeCloudBrainTwo,
  371. JobName: req.JobName,
  372. JobType: string(models.JobTypeTrain),
  373. })
  374. if errTemp != nil {
  375. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  376. return errTemp
  377. }
  378. }
  379. return createErr
  380. }
  381. jobId := strconv.FormatInt(jobResult.JobID, 10)
  382. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  383. Status: TransTrainJobStatus(jobResult.Status),
  384. UserID: ctx.User.ID,
  385. RepoID: ctx.Repo.Repository.ID,
  386. JobID: jobId,
  387. JobName: req.JobName,
  388. DisplayJobName: req.DisplayJobName,
  389. JobType: string(models.JobTypeTrain),
  390. Type: models.TypeCloudBrainTwo,
  391. VersionID: jobResult.VersionID,
  392. VersionName: jobResult.VersionName,
  393. Uuid: req.Uuid,
  394. DatasetName: req.DatasetName,
  395. CommitID: req.CommitID,
  396. IsLatestVersion: req.IsLatestVersion,
  397. ComputeResource: models.NPUResource,
  398. EngineID: req.EngineID,
  399. TrainUrl: req.TrainUrl,
  400. BranchName: req.BranchName,
  401. Parameters: req.Params,
  402. BootFile: req.BootFile,
  403. DataUrl: req.DataUrl,
  404. LogUrl: req.LogUrl,
  405. FlavorCode: req.Spec.SourceSpecId,
  406. Description: req.Description,
  407. WorkServerNumber: req.WorkServerNumber,
  408. FlavorName: req.FlavorName,
  409. EngineName: req.EngineName,
  410. VersionCount: req.VersionCount,
  411. TotalVersionCount: req.TotalVersionCount,
  412. CreatedUnix: createTime,
  413. UpdatedUnix: createTime,
  414. Spec: req.Spec,
  415. })
  416. if createErr != nil {
  417. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  418. return createErr
  419. }
  420. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  421. return nil
  422. }
  423. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  424. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  425. JobName: req.JobName,
  426. Description: req.Description,
  427. Config: models.UserImageConfig{
  428. WorkServerNum: req.WorkServerNumber,
  429. AppUrl: req.CodeObsPath,
  430. BootFileUrl: req.BootFileUrl,
  431. DataUrl: req.DataUrl,
  432. TrainUrl: req.TrainUrl,
  433. LogUrl: req.LogUrl,
  434. PoolID: req.PoolID,
  435. CreateVersion: true,
  436. Flavor: models.Flavor{
  437. Code: req.FlavorCode,
  438. },
  439. Parameter: req.Parameters,
  440. UserImageUrl: req.UserImageUrl,
  441. UserCommand: req.UserCommand,
  442. },
  443. })
  444. }
  445. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  446. createTime := timeutil.TimeStampNow()
  447. var jobResult *models.CreateTrainJobResult
  448. var createErr error
  449. if req.EngineID < 0 {
  450. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  451. Description: req.Description,
  452. Config: models.TrainJobVersionUserImageConfig{
  453. WorkServerNum: req.WorkServerNumber,
  454. AppUrl: req.CodeObsPath,
  455. BootFileUrl: req.BootFileUrl,
  456. DataUrl: req.DataUrl,
  457. TrainUrl: req.TrainUrl,
  458. LogUrl: req.LogUrl,
  459. PoolID: req.PoolID,
  460. Flavor: models.Flavor{
  461. Code: req.Spec.SourceSpecId,
  462. },
  463. Parameter: req.Parameters,
  464. PreVersionId: req.PreVersionId,
  465. UserImageUrl: req.UserImageUrl,
  466. UserCommand: req.UserCommand,
  467. },
  468. }, jobId)
  469. } else {
  470. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  471. Description: req.Description,
  472. Config: models.TrainJobVersionConfig{
  473. WorkServerNum: req.WorkServerNumber,
  474. AppUrl: req.CodeObsPath,
  475. BootFileUrl: req.BootFileUrl,
  476. DataUrl: req.DataUrl,
  477. EngineID: req.EngineID,
  478. TrainUrl: req.TrainUrl,
  479. LogUrl: req.LogUrl,
  480. PoolID: req.PoolID,
  481. Flavor: models.Flavor{
  482. Code: req.Spec.SourceSpecId,
  483. },
  484. Parameter: req.Parameters,
  485. PreVersionId: req.PreVersionId,
  486. },
  487. }, jobId)
  488. }
  489. if createErr != nil {
  490. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  491. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  492. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  493. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  494. JobID: jobId,
  495. VersionID: models.TempVersionId,
  496. Status: models.TempJobStatus,
  497. Type: models.TypeCloudBrainTwo,
  498. JobName: req.JobName,
  499. JobType: string(models.JobTypeTrain),
  500. })
  501. if errTemp != nil {
  502. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  503. return errTemp
  504. }
  505. }
  506. return createErr
  507. }
  508. var jobTypes []string
  509. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  510. repo := ctx.Repo.Repository
  511. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  512. RepoID: repo.ID,
  513. Type: models.TypeCloudBrainTwo,
  514. JobTypes: jobTypes,
  515. JobID: strconv.FormatInt(jobResult.JobID, 10),
  516. })
  517. if createErr != nil {
  518. ctx.ServerError("Cloudbrain", createErr)
  519. return createErr
  520. }
  521. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  522. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  523. Status: TransTrainJobStatus(jobResult.Status),
  524. UserID: ctx.User.ID,
  525. RepoID: ctx.Repo.Repository.ID,
  526. JobID: strconv.FormatInt(jobResult.JobID, 10),
  527. JobName: req.JobName,
  528. DisplayJobName: req.DisplayJobName,
  529. JobType: string(models.JobTypeTrain),
  530. Type: models.TypeCloudBrainTwo,
  531. VersionID: jobResult.VersionID,
  532. VersionName: jobResult.VersionName,
  533. Uuid: req.Uuid,
  534. DatasetName: req.DatasetName,
  535. CommitID: req.CommitID,
  536. IsLatestVersion: req.IsLatestVersion,
  537. PreVersionName: req.PreVersionName,
  538. ComputeResource: models.NPUResource,
  539. EngineID: req.EngineID,
  540. TrainUrl: req.TrainUrl,
  541. BranchName: req.BranchName,
  542. Parameters: req.Params,
  543. BootFile: req.BootFile,
  544. DataUrl: req.DataUrl,
  545. LogUrl: req.LogUrl,
  546. PreVersionId: req.PreVersionId,
  547. FlavorCode: req.Spec.SourceSpecId,
  548. Description: req.Description,
  549. WorkServerNumber: req.WorkServerNumber,
  550. FlavorName: req.FlavorName,
  551. EngineName: req.EngineName,
  552. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  553. VersionCount: VersionListCount + 1,
  554. CreatedUnix: createTime,
  555. UpdatedUnix: createTime,
  556. Spec: req.Spec,
  557. })
  558. if createErr != nil {
  559. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  560. return createErr
  561. }
  562. //将训练任务的上一版本的isLatestVersion设置为"0"
  563. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  564. if createErr != nil {
  565. ctx.ServerError("Update IsLatestVersion failed", createErr)
  566. return createErr
  567. }
  568. return createErr
  569. }
  570. func TransTrainJobStatus(status int) string {
  571. switch status {
  572. case 0:
  573. return "UNKNOWN"
  574. case 1:
  575. return "INIT"
  576. case 2:
  577. return "IMAGE_CREATING"
  578. case 3:
  579. return "IMAGE_FAILED"
  580. case 4:
  581. return "SUBMIT_TRYING"
  582. case 5:
  583. return "SUBMIT_FAILED"
  584. case 6:
  585. return "DELETE_FAILED"
  586. case 7:
  587. return "WAITING"
  588. case 8:
  589. return "RUNNING"
  590. case 9:
  591. return "KILLING"
  592. case 10:
  593. return "COMPLETED"
  594. case 11:
  595. return "FAILED"
  596. case 12:
  597. return "KILLED"
  598. case 13:
  599. return "CANCELED"
  600. case 14:
  601. return "LOST"
  602. case 15:
  603. return "SCALING"
  604. case 16:
  605. return "SUBMIT_MODEL_FAILED"
  606. case 17:
  607. return "DEPLOY_SERVICE_FAILED"
  608. case 18:
  609. return "CHECK_INIT"
  610. case 19:
  611. return "CHECK_RUNNING"
  612. case 20:
  613. return "CHECK_RUNNING_COMPLETED"
  614. case 21:
  615. return "CHECK_FAILED"
  616. default:
  617. return strconv.Itoa(status)
  618. }
  619. }
  620. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  621. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  622. VersionOutputPath = "V" + talVersionCountToString
  623. return VersionOutputPath
  624. }
  625. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  626. createTime := timeutil.TimeStampNow()
  627. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  628. JobName: req.JobName,
  629. Description: req.Description,
  630. InfConfig: models.InfConfig{
  631. WorkServerNum: req.WorkServerNumber,
  632. AppUrl: req.CodeObsPath,
  633. BootFileUrl: req.BootFileUrl,
  634. DataUrl: req.DataUrl,
  635. EngineID: req.EngineID,
  636. // TrainUrl: req.TrainUrl,
  637. LogUrl: req.LogUrl,
  638. PoolID: req.PoolID,
  639. CreateVersion: true,
  640. Flavor: models.Flavor{
  641. Code: req.Spec.SourceSpecId,
  642. },
  643. Parameter: req.Parameters,
  644. },
  645. })
  646. if err != nil {
  647. log.Error("createInferenceJob failed: %v", err.Error())
  648. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  649. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  650. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  651. JobID: models.TempJobId,
  652. VersionID: models.TempVersionId,
  653. Status: models.TempJobStatus,
  654. Type: models.TypeCloudBrainTwo,
  655. JobName: req.JobName,
  656. JobType: string(models.JobTypeInference),
  657. })
  658. if err != nil {
  659. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  660. return err
  661. }
  662. }
  663. return err
  664. }
  665. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  666. // if err != nil {
  667. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  668. // return err
  669. // }
  670. jobID := strconv.FormatInt(jobResult.JobID, 10)
  671. err = models.CreateCloudbrain(&models.Cloudbrain{
  672. Status: TransTrainJobStatus(jobResult.Status),
  673. UserID: ctx.User.ID,
  674. RepoID: ctx.Repo.Repository.ID,
  675. JobID: jobID,
  676. JobName: req.JobName,
  677. DisplayJobName: req.DisplayJobName,
  678. JobType: string(models.JobTypeInference),
  679. Type: models.TypeCloudBrainTwo,
  680. VersionID: jobResult.VersionID,
  681. VersionName: jobResult.VersionName,
  682. Uuid: req.Uuid,
  683. DatasetName: req.DatasetName,
  684. CommitID: req.CommitID,
  685. EngineID: req.EngineID,
  686. TrainUrl: req.TrainUrl,
  687. BranchName: req.BranchName,
  688. Parameters: req.Params,
  689. BootFile: req.BootFile,
  690. DataUrl: req.DataUrl,
  691. LogUrl: req.LogUrl,
  692. FlavorCode: req.Spec.SourceSpecId,
  693. Description: req.Description,
  694. WorkServerNumber: req.WorkServerNumber,
  695. FlavorName: req.FlavorName,
  696. EngineName: req.EngineName,
  697. LabelName: req.LabelName,
  698. IsLatestVersion: req.IsLatestVersion,
  699. ComputeResource: models.NPUResource,
  700. VersionCount: req.VersionCount,
  701. TotalVersionCount: req.TotalVersionCount,
  702. ModelName: req.ModelName,
  703. ModelVersion: req.ModelVersion,
  704. CkptName: req.CkptName,
  705. ResultUrl: req.ResultUrl,
  706. CreatedUnix: createTime,
  707. UpdatedUnix: createTime,
  708. Spec: req.Spec,
  709. })
  710. if err != nil {
  711. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  712. return err
  713. }
  714. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  715. return nil
  716. }
  717. func GetNotebookImageName(imageId string) (string, error) {
  718. var validImage = false
  719. var imageName = ""
  720. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  721. if imageInfo.Id == imageId {
  722. validImage = true
  723. imageName = imageInfo.Value
  724. }
  725. }
  726. if !validImage {
  727. log.Error("the image id(%s) is invalid", imageId)
  728. return imageName, errors.New("the image id is invalid")
  729. }
  730. return imageName, nil
  731. }
  732. func InitSpecialPool() {
  733. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  734. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  735. }
  736. }
  737. func InitMultiNode(){
  738. if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{
  739. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  740. }
  741. }
  742. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  743. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  744. if err != nil {
  745. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  746. return err
  747. }
  748. if result != nil {
  749. oldStatus := task.Status
  750. task.Status = TransTrainJobStatus(result.IntStatus)
  751. task.Duration = result.Duration / 1000
  752. task.TrainJobDuration = result.TrainJobDuration
  753. if task.StartTime == 0 && result.StartTime > 0 {
  754. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  755. }
  756. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  757. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  758. task.EndTime = task.StartTime.Add(task.Duration)
  759. }
  760. task.CorrectCreateUnix()
  761. if oldStatus != task.Status {
  762. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  763. }
  764. err = models.UpdateJob(task)
  765. if err != nil {
  766. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  767. return err
  768. }
  769. }
  770. return nil
  771. }
  772. func HandleNotebookInfo(task *models.Cloudbrain) error {
  773. var result *models.GetNotebook2Result
  774. var err error
  775. if task.Type == models.TypeCloudBrainTwo {
  776. result, err = GetNotebook2(task.JobID)
  777. } else if task.Type == models.TypeCDCenter {
  778. result, err = modelarts_cd.GetNotebook(task.JobID)
  779. }
  780. if err != nil {
  781. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  782. return err
  783. }
  784. if result != nil {
  785. oldStatus := task.Status
  786. task.Status = result.Status
  787. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  788. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  789. }
  790. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  791. task.EndTime = timeutil.TimeStampNow()
  792. }
  793. task.CorrectCreateUnix()
  794. task.ComputeAndSetDuration()
  795. if oldStatus != task.Status {
  796. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  797. }
  798. if task.FlavorCode == "" {
  799. task.FlavorCode = result.Flavor
  800. }
  801. err = models.UpdateJob(task)
  802. if err != nil {
  803. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  804. return err
  805. }
  806. }
  807. return nil
  808. }
  809. func SyncTempStatusJob() {
  810. jobs, err := models.GetCloudBrainTempJobs()
  811. if err != nil {
  812. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  813. return
  814. }
  815. for _, temp := range jobs {
  816. log.Info("start to handle record: %s", temp.JobName)
  817. if temp.Type == models.TypeCloudBrainTwo {
  818. if temp.JobType == string(models.JobTypeDebug) {
  819. err = handleNotebook(temp)
  820. if err != nil {
  821. log.Error("handleNotebook falied:%v", err)
  822. break
  823. }
  824. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  825. _, err = models.GetCloudbrainByJobID(temp.JobID)
  826. if err != nil {
  827. //one version
  828. err = handleTrainJob(temp)
  829. if err != nil {
  830. log.Error("handleTrainJob falied:%v", err)
  831. break
  832. }
  833. } else {
  834. //multi version
  835. err = handleTrainJobMultiVersion(temp)
  836. if err != nil {
  837. log.Error("handleTrainJobMultiVersion falied:%v", err)
  838. break
  839. }
  840. }
  841. }
  842. }
  843. }
  844. return
  845. }
  846. func handleNotebook(temp *models.CloudbrainTemp) error {
  847. if temp.Status == models.TempJobStatus {
  848. err := handleTempNotebook(temp)
  849. if err != nil {
  850. log.Error("handleTempNotebook failed:%v", err)
  851. return err
  852. }
  853. } else if temp.Status == string(models.ModelArtsStopping) {
  854. res, err := GetNotebook2(temp.JobID)
  855. if err != nil {
  856. log.Error("GetNotebook2 failed:%v", err)
  857. return err
  858. }
  859. temp.Status = res.Status
  860. if temp.Status == string(models.ModelArtsStopped) {
  861. err = models.UpdateCloudbrainTemp(temp)
  862. if err != nil {
  863. log.Error("UpdateCloudbrainTemp failed:%v", err)
  864. return err
  865. }
  866. _, err := DelNotebook2(temp.JobID)
  867. if err != nil {
  868. log.Error("DelNotebook2 failed:%v", err)
  869. return err
  870. }
  871. temp.Status = string(models.ModelArtsDeleted)
  872. err = models.UpdateCloudbrainTemp(temp)
  873. if err != nil {
  874. log.Error("UpdateCloudbrainTemp failed:%v", err)
  875. return err
  876. }
  877. }
  878. }
  879. return nil
  880. }
  881. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  882. var err error
  883. var isExist bool
  884. for {
  885. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  886. if err != nil {
  887. log.Error("GetNotebookList failed:%v", err)
  888. break
  889. }
  890. temp.QueryTimes++
  891. err = models.UpdateCloudbrainTemp(temp)
  892. if err != nil {
  893. log.Error("UpdateCloudbrainTemp failed:%v", err)
  894. }
  895. if result != nil {
  896. for _, notebook := range result.NotebookList {
  897. if temp.JobID == models.TempJobId {
  898. //new notebook
  899. if notebook.JobName == temp.JobName {
  900. isExist = true
  901. temp.Status = notebook.Status
  902. temp.JobID = notebook.JobID
  903. break
  904. }
  905. } else {
  906. //restart: always can find one record
  907. if notebook.JobName == temp.JobName {
  908. if notebook.Status != string(models.ModelArtsStopped) {
  909. isExist = true
  910. temp.Status = notebook.Status
  911. temp.JobID = notebook.JobID
  912. break
  913. }
  914. }
  915. }
  916. }
  917. if isExist {
  918. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  919. if temp.Status == string(models.ModelArtsCreateFailed) {
  920. err = models.UpdateCloudbrainTemp(temp)
  921. if err != nil {
  922. log.Error("UpdateCloudbrainTemp failed:%v", err)
  923. break
  924. }
  925. _, err := DelNotebook2(temp.JobID)
  926. if err != nil {
  927. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  928. break
  929. }
  930. temp.Status = string(models.ModelArtsDeleted)
  931. } else {
  932. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  933. if err != nil {
  934. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  935. break
  936. }
  937. temp.Status = string(models.ModelArtsStopping)
  938. }
  939. models.UpdateCloudbrainTemp(temp)
  940. } else {
  941. log.Error("can not find the record(%s) till now", temp.JobName)
  942. err = errors.New("not found")
  943. break
  944. }
  945. } else {
  946. log.Error("can not find the record(%s) till now", temp.JobName)
  947. err = errors.New("not found")
  948. break
  949. }
  950. break
  951. }
  952. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  953. log.Info("reach MaxTempQueryTimes, set the job failed")
  954. temp.Status = string(models.ModelArtsTrainJobFailed)
  955. err = models.UpdateCloudbrainTemp(temp)
  956. if err != nil {
  957. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  958. return err
  959. }
  960. }
  961. return err
  962. }
  963. func handleTrainJob(temp *models.CloudbrainTemp) error {
  964. if temp.Status == models.TempJobStatus {
  965. err := handleTempTrainJob(temp)
  966. if err != nil {
  967. log.Error("handleTempTrainJob failed:%v", err)
  968. return err
  969. }
  970. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  971. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  972. if err != nil {
  973. log.Error("GetTrainJob failed:%v", err)
  974. return err
  975. }
  976. temp.Status = TransTrainJobStatus(res.IntStatus)
  977. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  978. err = models.UpdateCloudbrainTemp(temp)
  979. if err != nil {
  980. log.Error("UpdateCloudbrainTemp failed:%v", err)
  981. return err
  982. }
  983. _, err := DelTrainJob(temp.JobID)
  984. if err != nil {
  985. log.Error("DelTrainJob failed:%v", err)
  986. return err
  987. }
  988. temp.Status = string(models.ModelArtsDeleted)
  989. err = models.UpdateCloudbrainTemp(temp)
  990. if err != nil {
  991. log.Error("UpdateCloudbrainTemp failed:%v", err)
  992. return err
  993. }
  994. }
  995. }
  996. return nil
  997. }
  998. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  999. if temp.Status == models.TempJobStatus {
  1000. err := handleTempTrainJobMultiVersion(temp)
  1001. if err != nil {
  1002. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1003. return err
  1004. }
  1005. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1006. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1007. if err != nil {
  1008. log.Error("GetTrainJob failed:%v", err)
  1009. return err
  1010. }
  1011. temp.Status = TransTrainJobStatus(res.IntStatus)
  1012. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1013. err = models.UpdateCloudbrainTemp(temp)
  1014. if err != nil {
  1015. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1016. return err
  1017. }
  1018. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1019. if err != nil {
  1020. log.Error("DelTrainJob failed:%v", err)
  1021. return err
  1022. }
  1023. temp.Status = string(models.ModelArtsDeleted)
  1024. err = models.UpdateCloudbrainTemp(temp)
  1025. if err != nil {
  1026. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1027. return err
  1028. }
  1029. }
  1030. }
  1031. return nil
  1032. }
  1033. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1034. var err error
  1035. var isExist bool
  1036. for {
  1037. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1038. if err != nil {
  1039. log.Error("GetTrainJobVersionList failed:%v", err)
  1040. break
  1041. }
  1042. temp.QueryTimes++
  1043. err = models.UpdateCloudbrainTemp(temp)
  1044. if err != nil {
  1045. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1046. }
  1047. if result != nil {
  1048. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1049. if result.VersionCount == int64(count+1) {
  1050. isExist = true
  1051. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1052. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1053. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1054. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1055. if err != nil {
  1056. log.Error("StopTrainJob failed:%v", err)
  1057. break
  1058. }
  1059. temp.Status = string(models.ModelArtsTrainJobKilling)
  1060. err = models.UpdateCloudbrainTemp(temp)
  1061. if err != nil {
  1062. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1063. break
  1064. }
  1065. } else {
  1066. log.Error("can not find the record(%s) till now", temp.JobName)
  1067. err = errors.New("not found")
  1068. break
  1069. }
  1070. }
  1071. break
  1072. }
  1073. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1074. log.Info("reach MaxTempQueryTimes, set the job failed")
  1075. temp.Status = string(models.ModelArtsTrainJobFailed)
  1076. err = models.UpdateCloudbrainTemp(temp)
  1077. if err != nil {
  1078. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1079. return err
  1080. }
  1081. }
  1082. return err
  1083. }
  1084. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1085. var err error
  1086. var isExist bool
  1087. for {
  1088. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1089. if err != nil {
  1090. log.Error("GetTrainJobList failed:%v", err)
  1091. break
  1092. }
  1093. temp.QueryTimes++
  1094. err = models.UpdateCloudbrainTemp(temp)
  1095. if err != nil {
  1096. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1097. }
  1098. if result != nil {
  1099. for _, job := range result.JobList {
  1100. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1101. isExist = true
  1102. temp.Status = TransTrainJobStatus(job.IntStatus)
  1103. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1104. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1105. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1106. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1107. if err != nil {
  1108. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1109. break
  1110. }
  1111. temp.Status = string(models.ModelArtsTrainJobKilling)
  1112. err = models.UpdateCloudbrainTemp(temp)
  1113. if err != nil {
  1114. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1115. break
  1116. }
  1117. }
  1118. }
  1119. if !isExist {
  1120. log.Error("can not find the record(%s) till now", temp.JobName)
  1121. err = errors.New("not found")
  1122. break
  1123. }
  1124. }
  1125. break
  1126. }
  1127. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1128. log.Info("reach MaxTempQueryTimes, set the job failed")
  1129. temp.Status = string(models.ModelArtsTrainJobFailed)
  1130. err = models.UpdateCloudbrainTemp(temp)
  1131. if err != nil {
  1132. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1133. return err
  1134. }
  1135. }
  1136. return err
  1137. }