You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 36 kB

4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286
  1. package modelarts
  2. import (
  3. "encoding/base64"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io/ioutil"
  8. "net/http"
  9. "path"
  10. "strconv"
  11. "strings"
  12. "code.gitea.io/gitea/modules/cloudbrain"
  13. "code.gitea.io/gitea/modules/modelarts_cd"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/context"
  16. "code.gitea.io/gitea/modules/log"
  17. "code.gitea.io/gitea/modules/notification"
  18. "code.gitea.io/gitea/modules/setting"
  19. "code.gitea.io/gitea/modules/timeutil"
  20. )
  21. const (
  22. //notebook
  23. storageTypeOBS = "obs"
  24. autoStopDuration = 4 * 60 * 60
  25. AutoStopDurationMs = 4 * 60 * 60 * 1000
  26. CodePath = "/code/"
  27. OutputPath = "/output/"
  28. ResultPath = "/result/"
  29. LogPath = "/log/"
  30. JobPath = "/job/"
  31. OrderDesc = "desc" //向下查询
  32. OrderAsc = "asc" //向上查询
  33. Lines = 500
  34. TrainUrl = "train_url"
  35. DataUrl = "data_url"
  36. MultiDataUrl = "multi_data_url"
  37. ResultUrl = "result_url"
  38. CkptUrl = "ckpt_url"
  39. DeviceTarget = "device_target"
  40. Ascend = "Ascend"
  41. PerPage = 10
  42. IsLatestVersion = "1"
  43. NotLatestVersion = "0"
  44. VersionCountOne = 1
  45. SortByCreateTime = "create_time"
  46. ConfigTypeCustom = "custom"
  47. TotalVersionCount = 1
  48. )
// Package-level holders for configuration decoded from JSON settings.
var (
	// poolInfos is decoded from setting.PoolInfos the first time
	// GenerateNotebook2 runs while it is still nil.
	poolInfos *models.PoolInfos
	// TrainFlavorInfos is not populated anywhere in this part of the file.
	// NOTE(review): presumably filled by flavor-loading code elsewhere — confirm.
	TrainFlavorInfos *Flavor
	// SpecialPools is decoded from setting.ModelArtsSpecialPools by InitSpecialPool.
	SpecialPools *models.SpecialPools
	// MultiNodeConfig is decoded from setting.ModelArtsMultiNode by InitMultiNode.
	MultiNodeConfig *MultiNodes
)
// GenerateTrainJobReq carries the inputs used by GenerateTrainJob,
// GenerateTrainJobVersion and GenerateModelConvertTrainJob to create a remote
// train job and, for the first two, the matching local Cloudbrain record.
type GenerateTrainJobReq struct {
	JobName          string
	DisplayJobName   string
	Uuid             string
	Description      string
	CodeObsPath      string // sent as AppUrl in the remote job config
	BootFile         string // stored on the local Cloudbrain record
	BootFileUrl      string // sent as BootFileUrl in the remote job config
	DataUrl          string
	TrainUrl         string
	LogUrl           string
	PoolID           string
	WorkServerNumber int
	// EngineID selects the creation call: a negative value takes the
	// user-image variant, anything else the engine-based variant.
	EngineID int64
	// Parameters is sent as Parameter in the remote job config.
	Parameters      []models.Parameter
	CommitID        string
	IsLatestVersion string
	// Params is stored as the Parameters field of the local Cloudbrain record.
	Params         string
	BranchName     string
	PreVersionId   int64  // used by GenerateTrainJobVersion
	PreVersionName string // used by GenerateTrainJobVersion
	// FlavorCode is the flavor code used by GenerateModelConvertTrainJob;
	// GenerateTrainJob and GenerateTrainJobVersion read Spec.SourceSpecId instead.
	FlavorCode        string
	FlavorName        string
	VersionCount      int
	EngineName        string
	TotalVersionCount int
	UserImageUrl      string // only sent on the user-image path
	UserCommand       string // only sent on the user-image path
	DatasetName       string
	// Spec supplies SourceSpecId as the flavor code and is stored on the record.
	Spec             *models.Specification
	ModelName        string
	LabelName        string
	CkptName         string
	ModelVersion     string
	PreTrainModelUrl string
}
// GenerateInferenceJobReq carries the inputs used by GenerateInferenceJob to
// create a remote inference job and the matching local Cloudbrain record.
type GenerateInferenceJobReq struct {
	JobName        string
	DisplayJobName string
	Uuid           string
	Description    string
	CodeObsPath    string // sent as AppUrl in the remote job config
	BootFile       string // stored on the local Cloudbrain record
	BootFileUrl    string // sent as BootFileUrl in the remote job config
	DataUrl        string
	// TrainUrl is stored on the local record only; GenerateInferenceJob does
	// not put it in the remote job config.
	TrainUrl         string
	LogUrl           string
	PoolID           string
	WorkServerNumber int
	// EngineID selects the creation call: a negative value takes the
	// user-image variant, anything else the engine-based variant.
	EngineID int64
	// Parameters is sent as Parameter in the remote job config.
	Parameters []models.Parameter
	CommitID   string
	// Params is stored as the Parameters field of the local Cloudbrain record.
	Params            string
	BranchName        string
	FlavorName        string
	EngineName        string
	LabelName         string
	IsLatestVersion   string
	VersionCount      int
	TotalVersionCount int
	ModelName         string
	ModelVersion      string
	CkptName          string
	ResultUrl         string
	// Spec supplies SourceSpecId as the flavor code and is stored on the record.
	Spec        *models.Specification
	DatasetName string
	// JobType is stored on the record; when it equals models.JobTypeModelSafety
	// a benchmark-task notification is sent instead of an inference-task one.
	JobType      string
	UserImageUrl string // only sent on the user-image path
	UserCommand  string // only sent on the user-image path
}
// VersionInfo maps a JSON document of the form
// {"version": [{"id": ..., "value": ..., "url": ...}]}.
type VersionInfo struct {
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
		Url   string `json:"url"`
	} `json:"version"`
}
// Flavor maps a JSON document of the form
// {"flavor": [{"code": ..., "value": ..., "unitPrice": ...}]}.
// It is the element type behind the package variable TrainFlavorInfos.
type Flavor struct {
	Info []struct {
		Code      string `json:"code"`
		Value     string `json:"value"`
		UnitPrice int64  `json:"unitPrice"`
	} `json:"flavor"`
}
// Engine maps a JSON document of the form
// {"engine": [{"id": ..., "value": ...}]}.
type Engine struct {
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
// ResourcePool maps a JSON document of the form
// {"resource_pool": [{"id": ..., "value": ...}]}.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
// MultiNodes maps a JSON document of the form {"multinode": [...]}, where each
// element is an OrgMultiNode. InitMultiNode decodes setting.ModelArtsMultiNode
// into a value of this type.
type MultiNodes struct {
	Info []OrgMultiNode `json:"multinode"`
}
// OrgMultiNode is one entry of MultiNodes: an "org" string paired with a list
// of integers under "node".
// NOTE(review): presumably the node counts an organization may request — confirm.
type OrgMultiNode struct {
	Org  string `json:"org"`
	Node []int  `json:"node"`
}
// Parameters maps a JSON document of the form
// {"parameter": [{"label": ..., "value": ...}]}.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}
  164. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification, bootFile string,autoStopDurationInMs int64) (string, error) {
  165. if poolInfos == nil {
  166. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  167. }
  168. imageName, err := GetNotebookImageName(imageId)
  169. if err != nil {
  170. log.Error("GetNotebookImageName failed: %v", err.Error())
  171. return "", err
  172. }
  173. createTime := timeutil.TimeStampNow()
  174. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  175. JobName: jobName,
  176. Description: description,
  177. Flavor: spec.SourceSpecId,
  178. Duration: autoStopDurationInMs,
  179. ImageID: imageId,
  180. PoolID: poolInfos.PoolInfo[0].PoolId,
  181. Feature: models.NotebookFeature,
  182. Volume: models.VolumeReq{
  183. Capacity: setting.Capacity,
  184. Category: models.EVSCategory,
  185. Ownership: models.ManagedOwnership,
  186. },
  187. WorkspaceID: "0",
  188. })
  189. if err != nil {
  190. log.Error("createNotebook2 failed: %v", err.Error())
  191. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  192. log.Info("(%s)unknown error, set temp status", displayJobName)
  193. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  194. JobID: models.TempJobId,
  195. VersionID: models.TempVersionId,
  196. Status: models.TempJobStatus,
  197. Type: models.TypeCloudBrainTwo,
  198. JobName: jobName,
  199. JobType: string(models.JobTypeDebug),
  200. })
  201. if errTemp != nil {
  202. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  203. return "", errTemp
  204. }
  205. }
  206. return "", err
  207. }
  208. task := &models.Cloudbrain{
  209. Status: jobResult.Status,
  210. UserID: ctx.User.ID,
  211. RepoID: ctx.Repo.Repository.ID,
  212. JobID: jobResult.ID,
  213. JobName: jobName,
  214. FlavorCode: spec.SourceSpecId,
  215. DisplayJobName: displayJobName,
  216. JobType: string(models.JobTypeDebug),
  217. Type: models.TypeCloudBrainTwo,
  218. Uuid: uuid,
  219. ComputeResource: models.NPUResource,
  220. Image: imageName,
  221. BootFile: bootFile,
  222. Description: description,
  223. CreatedUnix: createTime,
  224. UpdatedUnix: createTime,
  225. Spec: spec,
  226. }
  227. err = models.CreateCloudbrain(task)
  228. if err != nil {
  229. return "", err
  230. }
  231. stringId := strconv.FormatInt(task.ID, 10)
  232. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  233. return jobResult.ID, nil
  234. }
  235. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  236. createTime := timeutil.TimeStampNow()
  237. var jobResult *models.CreateTrainJobResult
  238. var createErr error
  239. if req.EngineID < 0 {
  240. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  241. JobName: req.JobName,
  242. Description: req.Description,
  243. Config: models.UserImageConfig{
  244. WorkServerNum: req.WorkServerNumber,
  245. AppUrl: req.CodeObsPath,
  246. BootFileUrl: req.BootFileUrl,
  247. DataUrl: req.DataUrl,
  248. TrainUrl: req.TrainUrl,
  249. LogUrl: req.LogUrl,
  250. PoolID: req.PoolID,
  251. CreateVersion: true,
  252. Flavor: models.Flavor{
  253. Code: req.Spec.SourceSpecId,
  254. },
  255. Parameter: req.Parameters,
  256. UserImageUrl: req.UserImageUrl,
  257. UserCommand: req.UserCommand,
  258. },
  259. })
  260. } else {
  261. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  262. JobName: req.JobName,
  263. Description: req.Description,
  264. Config: models.Config{
  265. WorkServerNum: req.WorkServerNumber,
  266. AppUrl: req.CodeObsPath,
  267. BootFileUrl: req.BootFileUrl,
  268. DataUrl: req.DataUrl,
  269. EngineID: req.EngineID,
  270. TrainUrl: req.TrainUrl,
  271. LogUrl: req.LogUrl,
  272. PoolID: req.PoolID,
  273. CreateVersion: true,
  274. Flavor: models.Flavor{
  275. Code: req.Spec.SourceSpecId,
  276. },
  277. Parameter: req.Parameters,
  278. },
  279. })
  280. }
  281. if createErr != nil {
  282. log.Error("createTrainJob failed: %v", createErr.Error())
  283. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  284. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  285. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  286. JobID: models.TempJobId,
  287. VersionID: models.TempVersionId,
  288. Status: models.TempJobStatus,
  289. Type: models.TypeCloudBrainTwo,
  290. JobName: req.JobName,
  291. JobType: string(models.JobTypeTrain),
  292. })
  293. if errTemp != nil {
  294. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  295. return "", errTemp
  296. }
  297. }
  298. return "", createErr
  299. }
  300. jobID := strconv.FormatInt(jobResult.JobID, 10)
  301. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  302. Status: TransTrainJobStatus(jobResult.Status),
  303. UserID: ctx.User.ID,
  304. RepoID: ctx.Repo.Repository.ID,
  305. JobID: jobID,
  306. JobName: req.JobName,
  307. DisplayJobName: req.DisplayJobName,
  308. JobType: string(models.JobTypeTrain),
  309. Type: models.TypeCloudBrainTwo,
  310. VersionID: jobResult.VersionID,
  311. VersionName: jobResult.VersionName,
  312. Uuid: req.Uuid,
  313. DatasetName: req.DatasetName,
  314. CommitID: req.CommitID,
  315. IsLatestVersion: req.IsLatestVersion,
  316. ComputeResource: models.NPUResource,
  317. EngineID: req.EngineID,
  318. TrainUrl: req.TrainUrl,
  319. BranchName: req.BranchName,
  320. Parameters: req.Params,
  321. BootFile: req.BootFile,
  322. DataUrl: req.DataUrl,
  323. LogUrl: req.LogUrl,
  324. FlavorCode: req.Spec.SourceSpecId,
  325. Description: req.Description,
  326. WorkServerNumber: req.WorkServerNumber,
  327. FlavorName: req.FlavorName,
  328. EngineName: req.EngineName,
  329. VersionCount: req.VersionCount,
  330. TotalVersionCount: req.TotalVersionCount,
  331. CreatedUnix: createTime,
  332. UpdatedUnix: createTime,
  333. Spec: req.Spec,
  334. ModelName: req.ModelName,
  335. ModelVersion: req.ModelVersion,
  336. LabelName: req.LabelName,
  337. PreTrainModelUrl: req.PreTrainModelUrl,
  338. CkptName: req.CkptName,
  339. })
  340. if createErr != nil {
  341. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  342. return "", createErr
  343. }
  344. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
  345. return jobID, nil
  346. }
  347. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  348. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  349. JobName: req.JobName,
  350. Description: req.Description,
  351. Config: models.UserImageConfig{
  352. WorkServerNum: req.WorkServerNumber,
  353. AppUrl: req.CodeObsPath,
  354. BootFileUrl: req.BootFileUrl,
  355. DataUrl: req.DataUrl,
  356. TrainUrl: req.TrainUrl,
  357. LogUrl: req.LogUrl,
  358. PoolID: req.PoolID,
  359. CreateVersion: true,
  360. Flavor: models.Flavor{
  361. Code: req.FlavorCode,
  362. },
  363. Parameter: req.Parameters,
  364. UserImageUrl: req.UserImageUrl,
  365. UserCommand: req.UserCommand,
  366. },
  367. })
  368. }
  369. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  370. createTime := timeutil.TimeStampNow()
  371. var jobResult *models.CreateTrainJobResult
  372. var createErr error
  373. if req.EngineID < 0 {
  374. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  375. Description: req.Description,
  376. Config: models.TrainJobVersionUserImageConfig{
  377. WorkServerNum: req.WorkServerNumber,
  378. AppUrl: req.CodeObsPath,
  379. BootFileUrl: req.BootFileUrl,
  380. DataUrl: req.DataUrl,
  381. TrainUrl: req.TrainUrl,
  382. LogUrl: req.LogUrl,
  383. PoolID: req.PoolID,
  384. Flavor: models.Flavor{
  385. Code: req.Spec.SourceSpecId,
  386. },
  387. Parameter: req.Parameters,
  388. PreVersionId: req.PreVersionId,
  389. UserImageUrl: req.UserImageUrl,
  390. UserCommand: req.UserCommand,
  391. },
  392. }, jobId)
  393. } else {
  394. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  395. Description: req.Description,
  396. Config: models.TrainJobVersionConfig{
  397. WorkServerNum: req.WorkServerNumber,
  398. AppUrl: req.CodeObsPath,
  399. BootFileUrl: req.BootFileUrl,
  400. DataUrl: req.DataUrl,
  401. EngineID: req.EngineID,
  402. TrainUrl: req.TrainUrl,
  403. LogUrl: req.LogUrl,
  404. PoolID: req.PoolID,
  405. Flavor: models.Flavor{
  406. Code: req.Spec.SourceSpecId,
  407. },
  408. Parameter: req.Parameters,
  409. PreVersionId: req.PreVersionId,
  410. },
  411. }, jobId)
  412. }
  413. if createErr != nil {
  414. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  415. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  416. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  417. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  418. JobID: jobId,
  419. VersionID: models.TempVersionId,
  420. Status: models.TempJobStatus,
  421. Type: models.TypeCloudBrainTwo,
  422. JobName: req.JobName,
  423. JobType: string(models.JobTypeTrain),
  424. })
  425. if errTemp != nil {
  426. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  427. return errTemp
  428. }
  429. }
  430. return createErr
  431. }
  432. var jobTypes []string
  433. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  434. repo := ctx.Repo.Repository
  435. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  436. RepoID: repo.ID,
  437. Type: models.TypeCloudBrainTwo,
  438. JobTypes: jobTypes,
  439. JobID: strconv.FormatInt(jobResult.JobID, 10),
  440. })
  441. if createErr != nil {
  442. ctx.ServerError("Cloudbrain", createErr)
  443. return createErr
  444. }
  445. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  446. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  447. Status: TransTrainJobStatus(jobResult.Status),
  448. UserID: ctx.User.ID,
  449. RepoID: ctx.Repo.Repository.ID,
  450. JobID: strconv.FormatInt(jobResult.JobID, 10),
  451. JobName: req.JobName,
  452. DisplayJobName: req.DisplayJobName,
  453. JobType: string(models.JobTypeTrain),
  454. Type: models.TypeCloudBrainTwo,
  455. VersionID: jobResult.VersionID,
  456. VersionName: jobResult.VersionName,
  457. Uuid: req.Uuid,
  458. DatasetName: req.DatasetName,
  459. CommitID: req.CommitID,
  460. IsLatestVersion: req.IsLatestVersion,
  461. PreVersionName: req.PreVersionName,
  462. ComputeResource: models.NPUResource,
  463. EngineID: req.EngineID,
  464. TrainUrl: req.TrainUrl,
  465. BranchName: req.BranchName,
  466. Parameters: req.Params,
  467. BootFile: req.BootFile,
  468. DataUrl: req.DataUrl,
  469. LogUrl: req.LogUrl,
  470. PreVersionId: req.PreVersionId,
  471. FlavorCode: req.Spec.SourceSpecId,
  472. Description: req.Description,
  473. WorkServerNumber: req.WorkServerNumber,
  474. FlavorName: req.FlavorName,
  475. EngineName: req.EngineName,
  476. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  477. VersionCount: VersionListCount + 1,
  478. CreatedUnix: createTime,
  479. UpdatedUnix: createTime,
  480. Spec: req.Spec,
  481. ModelName: req.ModelName,
  482. ModelVersion: req.ModelVersion,
  483. LabelName: req.LabelName,
  484. PreTrainModelUrl: req.PreTrainModelUrl,
  485. CkptName: req.CkptName,
  486. })
  487. if createErr != nil {
  488. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  489. return createErr
  490. }
  491. //将训练任务的上一版本的isLatestVersion设置为"0"
  492. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  493. if createErr != nil {
  494. ctx.ServerError("Update IsLatestVersion failed", createErr)
  495. return createErr
  496. }
  497. return createErr
  498. }
  499. func TransTrainJobStatus(status int) string {
  500. switch status {
  501. case 0:
  502. return "UNKNOWN"
  503. case 1:
  504. return "INIT"
  505. case 2:
  506. return "IMAGE_CREATING"
  507. case 3:
  508. return "IMAGE_FAILED"
  509. case 4:
  510. return "SUBMIT_TRYING"
  511. case 5:
  512. return "SUBMIT_FAILED"
  513. case 6:
  514. return "DELETE_FAILED"
  515. case 7:
  516. return "WAITING"
  517. case 8:
  518. return "RUNNING"
  519. case 9:
  520. return "KILLING"
  521. case 10:
  522. return "COMPLETED"
  523. case 11:
  524. return "FAILED"
  525. case 12:
  526. return "KILLED"
  527. case 13:
  528. return "CANCELED"
  529. case 14:
  530. return "LOST"
  531. case 15:
  532. return "SCALING"
  533. case 16:
  534. return "SUBMIT_MODEL_FAILED"
  535. case 17:
  536. return "DEPLOY_SERVICE_FAILED"
  537. case 18:
  538. return "CHECK_INIT"
  539. case 19:
  540. return "CHECK_RUNNING"
  541. case 20:
  542. return "CHECK_RUNNING_COMPLETED"
  543. case 21:
  544. return "CHECK_FAILED"
  545. default:
  546. return strconv.Itoa(status)
  547. }
  548. }
  549. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  550. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  551. VersionOutputPath = "V" + talVersionCountToString
  552. return VersionOutputPath
  553. }
  554. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
  555. createTime := timeutil.TimeStampNow()
  556. var jobResult *models.CreateTrainJobResult
  557. var createErr error
  558. if req.EngineID < 0 {
  559. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  560. JobName: req.JobName,
  561. Description: req.Description,
  562. Config: models.InfUserImageConfig{
  563. WorkServerNum: req.WorkServerNumber,
  564. AppUrl: req.CodeObsPath,
  565. BootFileUrl: req.BootFileUrl,
  566. DataUrl: req.DataUrl,
  567. // TrainUrl: req.TrainUrl,
  568. LogUrl: req.LogUrl,
  569. PoolID: req.PoolID,
  570. CreateVersion: true,
  571. Flavor: models.Flavor{
  572. Code: req.Spec.SourceSpecId,
  573. },
  574. Parameter: req.Parameters,
  575. UserImageUrl: req.UserImageUrl,
  576. UserCommand: req.UserCommand,
  577. },
  578. })
  579. } else {
  580. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  581. JobName: req.JobName,
  582. Description: req.Description,
  583. InfConfig: models.InfConfig{
  584. WorkServerNum: req.WorkServerNumber,
  585. AppUrl: req.CodeObsPath,
  586. BootFileUrl: req.BootFileUrl,
  587. DataUrl: req.DataUrl,
  588. EngineID: req.EngineID,
  589. // TrainUrl: req.TrainUrl,
  590. LogUrl: req.LogUrl,
  591. PoolID: req.PoolID,
  592. CreateVersion: true,
  593. Flavor: models.Flavor{
  594. Code: req.Spec.SourceSpecId,
  595. },
  596. Parameter: req.Parameters,
  597. },
  598. })
  599. }
  600. if createErr != nil {
  601. log.Error("createInferenceJob failed: %v", err.Error())
  602. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  603. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  604. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  605. JobID: models.TempJobId,
  606. VersionID: models.TempVersionId,
  607. Status: models.TempJobStatus,
  608. Type: models.TypeCloudBrainTwo,
  609. JobName: req.JobName,
  610. JobType: req.JobType,
  611. })
  612. if err != nil {
  613. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  614. return "", err
  615. }
  616. }
  617. return "", err
  618. }
  619. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  620. // if err != nil {
  621. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  622. // return err
  623. // }
  624. jobID := strconv.FormatInt(jobResult.JobID, 10)
  625. err = models.CreateCloudbrain(&models.Cloudbrain{
  626. Status: TransTrainJobStatus(jobResult.Status),
  627. UserID: ctx.User.ID,
  628. RepoID: ctx.Repo.Repository.ID,
  629. JobID: jobID,
  630. JobName: req.JobName,
  631. DisplayJobName: req.DisplayJobName,
  632. JobType: req.JobType,
  633. Type: models.TypeCloudBrainTwo,
  634. VersionID: jobResult.VersionID,
  635. VersionName: jobResult.VersionName,
  636. Uuid: req.Uuid,
  637. DatasetName: req.DatasetName,
  638. CommitID: req.CommitID,
  639. EngineID: req.EngineID,
  640. TrainUrl: req.TrainUrl,
  641. BranchName: req.BranchName,
  642. Parameters: req.Params,
  643. BootFile: req.BootFile,
  644. DataUrl: req.DataUrl,
  645. LogUrl: req.LogUrl,
  646. FlavorCode: req.Spec.SourceSpecId,
  647. Description: req.Description,
  648. WorkServerNumber: req.WorkServerNumber,
  649. FlavorName: req.FlavorName,
  650. EngineName: req.EngineName,
  651. LabelName: req.LabelName,
  652. IsLatestVersion: req.IsLatestVersion,
  653. ComputeResource: models.NPUResource,
  654. VersionCount: req.VersionCount,
  655. TotalVersionCount: req.TotalVersionCount,
  656. ModelName: req.ModelName,
  657. ModelVersion: req.ModelVersion,
  658. CkptName: req.CkptName,
  659. ResultUrl: req.ResultUrl,
  660. CreatedUnix: createTime,
  661. UpdatedUnix: createTime,
  662. Spec: req.Spec,
  663. })
  664. if err != nil {
  665. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  666. return "", err
  667. }
  668. if req.JobType == string(models.JobTypeModelSafety) {
  669. task, err := models.GetCloudbrainByJobID(jobID)
  670. if err == nil {
  671. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  672. }
  673. } else {
  674. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  675. }
  676. return jobID, nil
  677. }
  678. func GetNotebookImageName(imageId string) (string, error) {
  679. var validImage = false
  680. var imageName = ""
  681. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  682. if imageInfo.Id == imageId {
  683. validImage = true
  684. imageName = imageInfo.Value
  685. }
  686. }
  687. if !validImage {
  688. log.Error("the image id(%s) is invalid", imageId)
  689. return imageName, errors.New("the image id is invalid")
  690. }
  691. return imageName, nil
  692. }
  693. func InitSpecialPool() {
  694. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  695. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  696. }
  697. }
  698. func InitMultiNode() {
  699. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  700. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  701. }
  702. }
  703. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  704. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  705. if err != nil {
  706. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  707. return err
  708. }
  709. if result != nil {
  710. oldStatus := task.Status
  711. task.Status = TransTrainJobStatus(result.IntStatus)
  712. task.Duration = result.Duration / 1000
  713. task.TrainJobDuration = result.TrainJobDuration
  714. if task.StartTime == 0 && result.StartTime > 0 {
  715. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  716. }
  717. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  718. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  719. task.EndTime = task.StartTime.Add(task.Duration)
  720. }
  721. task.CorrectCreateUnix()
  722. if oldStatus != task.Status {
  723. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  724. }
  725. err = models.UpdateJob(task)
  726. if err != nil {
  727. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  728. return err
  729. }
  730. }
  731. return nil
  732. }
  733. func HandleNotebookInfo(task *models.Cloudbrain) error {
  734. var result *models.GetNotebook2Result
  735. var err error
  736. if task.Type == models.TypeCloudBrainTwo {
  737. result, err = GetNotebook2(task.JobID)
  738. } else if task.Type == models.TypeCDCenter {
  739. result, err = modelarts_cd.GetNotebook(task.JobID)
  740. }
  741. if err != nil {
  742. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  743. return err
  744. }
  745. if result != nil {
  746. oldStatus := task.Status
  747. task.Status = result.Status
  748. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  749. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  750. }
  751. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  752. task.EndTime = timeutil.TimeStampNow()
  753. }
  754. task.CorrectCreateUnix()
  755. task.ComputeAndSetDuration()
  756. if oldStatus != task.Status {
  757. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  758. }
  759. if task.FlavorCode == "" {
  760. task.FlavorCode = result.Flavor
  761. }
  762. if oldStatus != task.Status && task.Status == string(models.ModelArtsRunning) && task.BootFile != "" {
  763. uploadNoteBookFile(task, result)
  764. }
  765. err = models.UpdateJob(task)
  766. if err != nil {
  767. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  768. return err
  769. }
  770. }
  771. return nil
  772. }
  773. func uploadNoteBookFile(task *models.Cloudbrain, result *models.GetNotebook2Result) {
  774. jupyterUrl := result.Url + "?token=" + result.Token
  775. cookies, xsrf := getCookiesAndCsrf(jupyterUrl)
  776. if xsrf == "" {
  777. log.Error("browser jupyterUrl failed:%v", task.DisplayJobName)
  778. } else {
  779. codePath := setting.JobPath + task.JobName + cloudbrain.CodeMountPath
  780. fileContents, err := ioutil.ReadFile(codePath + "/" + task.BootFile)
  781. if err != nil {
  782. log.Error("read jupyter file failed:%v", task.DisplayJobName, err)
  783. }
  784. base64Content := base64.StdEncoding.EncodeToString(fileContents)
  785. client := getRestyClient()
  786. uploadUrl := getJupyterBaseUrl(result.Url) + "api/contents/" + path.Base(task.BootFile)
  787. res, err := client.R().
  788. SetCookies(cookies).
  789. SetHeader("X-XSRFToken", xsrf).
  790. SetBody(map[string]interface{}{
  791. "type": "file",
  792. "format": "base64",
  793. "name": path.Base(task.BootFile),
  794. "path": path.Base(task.BootFile),
  795. "content": base64Content}).
  796. Put(uploadUrl)
  797. if err != nil {
  798. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  799. } else if res.StatusCode() != http.StatusCreated {
  800. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  801. }
  802. }
  803. }
  804. func getJupyterBaseUrl(url string) string {
  805. jupyterUrlLength := len(url)
  806. baseUrl := url[0 : jupyterUrlLength-len(path.Base(url))]
  807. return baseUrl
  808. }
  809. func getCookiesAndCsrf(jupyterUrl string) ([]*http.Cookie, string) {
  810. log.Info("jupyter url:"+jupyterUrl)
  811. var cookies []*http.Cookie
  812. const retryTimes = 10
  813. for i := 0; i < retryTimes; i++ {
  814. res, err := http.Get(jupyterUrl)
  815. if err != nil {
  816. log.Error("browser jupyterUrl failed.",err)
  817. if i==retryTimes-1{
  818. return cookies, ""
  819. }
  820. } else {
  821. cookies = res.Cookies()
  822. xsrf := ""
  823. for _, cookie := range cookies {
  824. if cookie.Name == "_xsrf" {
  825. xsrf = cookie.Value
  826. break
  827. }
  828. }
  829. if xsrf != "" {
  830. return cookies, xsrf
  831. }
  832. }
  833. }
  834. return cookies, ""
  835. }
  836. func SyncTempStatusJob() {
  837. jobs, err := models.GetCloudBrainTempJobs()
  838. if err != nil {
  839. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  840. return
  841. }
  842. for _, temp := range jobs {
  843. log.Info("start to handle record: %s", temp.JobName)
  844. if temp.Type == models.TypeCloudBrainTwo {
  845. if temp.JobType == string(models.JobTypeDebug) {
  846. err = handleNotebook(temp)
  847. if err != nil {
  848. log.Error("handleNotebook falied:%v", err)
  849. break
  850. }
  851. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  852. _, err = models.GetCloudbrainByJobID(temp.JobID)
  853. if err != nil {
  854. //one version
  855. err = handleTrainJob(temp)
  856. if err != nil {
  857. log.Error("handleTrainJob falied:%v", err)
  858. break
  859. }
  860. } else {
  861. //multi version
  862. err = handleTrainJobMultiVersion(temp)
  863. if err != nil {
  864. log.Error("handleTrainJobMultiVersion falied:%v", err)
  865. break
  866. }
  867. }
  868. }
  869. }
  870. }
  871. return
  872. }
  873. func handleNotebook(temp *models.CloudbrainTemp) error {
  874. if temp.Status == models.TempJobStatus {
  875. err := handleTempNotebook(temp)
  876. if err != nil {
  877. log.Error("handleTempNotebook failed:%v", err)
  878. return err
  879. }
  880. } else if temp.Status == string(models.ModelArtsStopping) {
  881. res, err := GetNotebook2(temp.JobID)
  882. if err != nil {
  883. log.Error("GetNotebook2 failed:%v", err)
  884. return err
  885. }
  886. temp.Status = res.Status
  887. if temp.Status == string(models.ModelArtsStopped) {
  888. err = models.UpdateCloudbrainTemp(temp)
  889. if err != nil {
  890. log.Error("UpdateCloudbrainTemp failed:%v", err)
  891. return err
  892. }
  893. _, err := DelNotebook2(temp.JobID)
  894. if err != nil {
  895. log.Error("DelNotebook2 failed:%v", err)
  896. return err
  897. }
  898. temp.Status = string(models.ModelArtsDeleted)
  899. err = models.UpdateCloudbrainTemp(temp)
  900. if err != nil {
  901. log.Error("UpdateCloudbrainTemp failed:%v", err)
  902. return err
  903. }
  904. }
  905. }
  906. return nil
  907. }
  908. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  909. var err error
  910. var isExist bool
  911. for {
  912. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  913. if err != nil {
  914. log.Error("GetNotebookList failed:%v", err)
  915. break
  916. }
  917. temp.QueryTimes++
  918. err = models.UpdateCloudbrainTemp(temp)
  919. if err != nil {
  920. log.Error("UpdateCloudbrainTemp failed:%v", err)
  921. }
  922. if result != nil {
  923. for _, notebook := range result.NotebookList {
  924. if temp.JobID == models.TempJobId {
  925. //new notebook
  926. if notebook.JobName == temp.JobName {
  927. isExist = true
  928. temp.Status = notebook.Status
  929. temp.JobID = notebook.JobID
  930. break
  931. }
  932. } else {
  933. //restart: always can find one record
  934. if notebook.JobName == temp.JobName {
  935. if notebook.Status != string(models.ModelArtsStopped) {
  936. isExist = true
  937. temp.Status = notebook.Status
  938. temp.JobID = notebook.JobID
  939. break
  940. }
  941. }
  942. }
  943. }
  944. if isExist {
  945. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  946. if temp.Status == string(models.ModelArtsCreateFailed) {
  947. err = models.UpdateCloudbrainTemp(temp)
  948. if err != nil {
  949. log.Error("UpdateCloudbrainTemp failed:%v", err)
  950. break
  951. }
  952. _, err := DelNotebook2(temp.JobID)
  953. if err != nil {
  954. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  955. break
  956. }
  957. temp.Status = string(models.ModelArtsDeleted)
  958. } else {
  959. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  960. if err != nil {
  961. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  962. break
  963. }
  964. temp.Status = string(models.ModelArtsStopping)
  965. }
  966. models.UpdateCloudbrainTemp(temp)
  967. } else {
  968. log.Error("can not find the record(%s) till now", temp.JobName)
  969. err = errors.New("not found")
  970. break
  971. }
  972. } else {
  973. log.Error("can not find the record(%s) till now", temp.JobName)
  974. err = errors.New("not found")
  975. break
  976. }
  977. break
  978. }
  979. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  980. log.Info("reach MaxTempQueryTimes, set the job failed")
  981. temp.Status = string(models.ModelArtsTrainJobFailed)
  982. err = models.UpdateCloudbrainTemp(temp)
  983. if err != nil {
  984. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  985. return err
  986. }
  987. }
  988. return err
  989. }
  990. func handleTrainJob(temp *models.CloudbrainTemp) error {
  991. if temp.Status == models.TempJobStatus {
  992. err := handleTempTrainJob(temp)
  993. if err != nil {
  994. log.Error("handleTempTrainJob failed:%v", err)
  995. return err
  996. }
  997. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  998. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  999. if err != nil {
  1000. log.Error("GetTrainJob failed:%v", err)
  1001. return err
  1002. }
  1003. temp.Status = TransTrainJobStatus(res.IntStatus)
  1004. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1005. err = models.UpdateCloudbrainTemp(temp)
  1006. if err != nil {
  1007. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1008. return err
  1009. }
  1010. _, err := DelTrainJob(temp.JobID)
  1011. if err != nil {
  1012. log.Error("DelTrainJob failed:%v", err)
  1013. return err
  1014. }
  1015. temp.Status = string(models.ModelArtsDeleted)
  1016. err = models.UpdateCloudbrainTemp(temp)
  1017. if err != nil {
  1018. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1019. return err
  1020. }
  1021. }
  1022. }
  1023. return nil
  1024. }
  1025. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1026. if temp.Status == models.TempJobStatus {
  1027. err := handleTempTrainJobMultiVersion(temp)
  1028. if err != nil {
  1029. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1030. return err
  1031. }
  1032. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1033. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1034. if err != nil {
  1035. log.Error("GetTrainJob failed:%v", err)
  1036. return err
  1037. }
  1038. temp.Status = TransTrainJobStatus(res.IntStatus)
  1039. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1040. err = models.UpdateCloudbrainTemp(temp)
  1041. if err != nil {
  1042. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1043. return err
  1044. }
  1045. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1046. if err != nil {
  1047. log.Error("DelTrainJob failed:%v", err)
  1048. return err
  1049. }
  1050. temp.Status = string(models.ModelArtsDeleted)
  1051. err = models.UpdateCloudbrainTemp(temp)
  1052. if err != nil {
  1053. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1054. return err
  1055. }
  1056. }
  1057. }
  1058. return nil
  1059. }
  1060. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1061. var err error
  1062. var isExist bool
  1063. for {
  1064. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1065. if err != nil {
  1066. log.Error("GetTrainJobVersionList failed:%v", err)
  1067. break
  1068. }
  1069. temp.QueryTimes++
  1070. err = models.UpdateCloudbrainTemp(temp)
  1071. if err != nil {
  1072. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1073. }
  1074. if result != nil {
  1075. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1076. if result.VersionCount == int64(count+1) {
  1077. isExist = true
  1078. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1079. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1080. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1081. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1082. if err != nil {
  1083. log.Error("StopTrainJob failed:%v", err)
  1084. break
  1085. }
  1086. temp.Status = string(models.ModelArtsTrainJobKilling)
  1087. err = models.UpdateCloudbrainTemp(temp)
  1088. if err != nil {
  1089. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1090. break
  1091. }
  1092. } else {
  1093. log.Error("can not find the record(%s) till now", temp.JobName)
  1094. err = errors.New("not found")
  1095. break
  1096. }
  1097. }
  1098. break
  1099. }
  1100. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1101. log.Info("reach MaxTempQueryTimes, set the job failed")
  1102. temp.Status = string(models.ModelArtsTrainJobFailed)
  1103. err = models.UpdateCloudbrainTemp(temp)
  1104. if err != nil {
  1105. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1106. return err
  1107. }
  1108. }
  1109. return err
  1110. }
  1111. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1112. var err error
  1113. var isExist bool
  1114. for {
  1115. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1116. if err != nil {
  1117. log.Error("GetTrainJobList failed:%v", err)
  1118. break
  1119. }
  1120. temp.QueryTimes++
  1121. err = models.UpdateCloudbrainTemp(temp)
  1122. if err != nil {
  1123. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1124. }
  1125. if result != nil {
  1126. for _, job := range result.JobList {
  1127. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1128. isExist = true
  1129. temp.Status = TransTrainJobStatus(job.IntStatus)
  1130. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1131. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1132. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1133. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1134. if err != nil {
  1135. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1136. break
  1137. }
  1138. temp.Status = string(models.ModelArtsTrainJobKilling)
  1139. err = models.UpdateCloudbrainTemp(temp)
  1140. if err != nil {
  1141. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1142. break
  1143. }
  1144. }
  1145. }
  1146. if !isExist {
  1147. log.Error("can not find the record(%s) till now", temp.JobName)
  1148. err = errors.New("not found")
  1149. break
  1150. }
  1151. }
  1152. break
  1153. }
  1154. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1155. log.Info("reach MaxTempQueryTimes, set the job failed")
  1156. temp.Status = string(models.ModelArtsTrainJobFailed)
  1157. err = models.UpdateCloudbrainTemp(temp)
  1158. if err != nil {
  1159. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1160. return err
  1161. }
  1162. }
  1163. return err
  1164. }