You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

modelarts.go 36 kB

4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290
  1. package modelarts
  2. import (
  3. "encoding/base64"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io/ioutil"
  8. "net/http"
  9. "path"
  10. "strconv"
  11. "strings"
  12. "code.gitea.io/gitea/modules/cloudbrain"
  13. "code.gitea.io/gitea/modules/modelarts_cd"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/context"
  16. "code.gitea.io/gitea/modules/log"
  17. "code.gitea.io/gitea/modules/notification"
  18. "code.gitea.io/gitea/modules/setting"
  19. "code.gitea.io/gitea/modules/timeutil"
  20. )
  // Constants shared by the notebook, training and inference helpers in this
  // package.
  const (
  	// notebook
  	storageTypeOBS     = "obs"
  	autoStopDuration   = 4 * 60 * 60             // 14400
  	AutoStopDurationMs = autoStopDuration * 1000 // 14400000, i.e. 1000x autoStopDuration

  	// Path fragments, each wrapped in slashes.
  	CodePath   = "/code/"
  	OutputPath = "/output/"
  	ResultPath = "/result/"
  	LogPath    = "/log/"
  	JobPath    = "/job/"

  	// Query ordering keywords.
  	OrderDesc = "desc" // query downward
  	OrderAsc  = "asc"  // query upward
  	Lines     = 500    // presumably the number of log lines per query; confirm against callers

  	// String labels; presumably run-parameter names and values handed to
  	// jobs. Verify against callers.
  	TrainUrl     = "train_url"
  	DataUrl      = "data_url"
  	MultiDataUrl = "multi_data_url"
  	ResultUrl    = "result_url"
  	CkptUrl      = "ckpt_url"
  	DeviceTarget = "device_target"
  	Ascend       = "Ascend"

  	PerPage = 10

  	// Version bookkeeping. NotLatestVersion, VersionCountOne and
  	// TotalVersionCount are the values GenerateTrainJobVersion passes to
  	// models.SetVersionCountAndLatestVersion for the superseded version.
  	IsLatestVersion  = "1"
  	NotLatestVersion = "0"
  	VersionCountOne  = 1

  	SortByCreateTime  = "create_time"
  	ConfigTypeCustom  = "custom"
  	TotalVersionCount = 1
  )
  49. var (
  50. poolInfos *models.PoolInfos
  51. TrainFlavorInfos *Flavor
  52. SpecialPools *models.SpecialPools
  53. MultiNodeConfig *MultiNodes
  54. )
  55. type GenerateTrainJobReq struct {
  56. JobName string
  57. DisplayJobName string
  58. Uuid string
  59. Description string
  60. CodeObsPath string
  61. BootFile string
  62. BootFileUrl string
  63. DataUrl string
  64. TrainUrl string
  65. LogUrl string
  66. PoolID string
  67. WorkServerNumber int
  68. EngineID int64
  69. Parameters []models.Parameter
  70. CommitID string
  71. IsLatestVersion string
  72. Params string
  73. BranchName string
  74. PreVersionId int64
  75. PreVersionName string
  76. FlavorCode string
  77. FlavorName string
  78. VersionCount int
  79. EngineName string
  80. TotalVersionCount int
  81. UserImageUrl string
  82. UserCommand string
  83. DatasetName string
  84. Spec *models.Specification
  85. ModelName string
  86. LabelName string
  87. CkptName string
  88. ModelVersion string
  89. PreTrainModelUrl string
  90. }
  91. type GenerateInferenceJobReq struct {
  92. JobName string
  93. DisplayJobName string
  94. Uuid string
  95. Description string
  96. CodeObsPath string
  97. BootFile string
  98. BootFileUrl string
  99. DataUrl string
  100. TrainUrl string
  101. LogUrl string
  102. PoolID string
  103. WorkServerNumber int
  104. EngineID int64
  105. Parameters []models.Parameter
  106. CommitID string
  107. Params string
  108. BranchName string
  109. FlavorName string
  110. EngineName string
  111. LabelName string
  112. IsLatestVersion string
  113. VersionCount int
  114. TotalVersionCount int
  115. ModelName string
  116. ModelVersion string
  117. CkptName string
  118. ResultUrl string
  119. Spec *models.Specification
  120. DatasetName string
  121. JobType string
  122. UserImageUrl string
  123. UserCommand string
  124. }
  // VersionInfo maps a JSON document whose "version" key holds a list of
  // {id, value, url} entries.
  type VersionInfo struct {
  	Version []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  		Url   string `json:"url"`
  	} `json:"version"`
  }
  // Flavor maps a JSON document whose "flavor" key holds a list of
  // {code, value, unitPrice} entries.
  type Flavor struct {
  	Info []struct {
  		Code      string `json:"code"`
  		Value     string `json:"value"`
  		UnitPrice int64  `json:"unitPrice"`
  	} `json:"flavor"`
  }
  // Engine maps a JSON document whose "engine" key holds a list of
  // {id, value} entries.
  type Engine struct {
  	Info []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  	} `json:"engine"`
  }
  // ResourcePool maps a JSON document whose "resource_pool" key holds a list
  // of {id, value} entries; unlike Engine, the id here is a string.
  type ResourcePool struct {
  	Info []struct {
  		ID    string `json:"id"`
  		Value string `json:"value"`
  	} `json:"resource_pool"`
  }
  151. type MultiNodes struct {
  152. Info []OrgMultiNode `json:"multinode"`
  153. }
  // OrgMultiNode is one entry of MultiNodes: an "org" string paired with a
  // list of integers under "node" (presumably the node counts that
  // organization may use; confirm against callers).
  type OrgMultiNode struct {
  	Org  string `json:"org"`
  	Node []int  `json:"node"`
  }
  // Parameters maps a JSON document whose "parameter" key holds a list of
  // {label, value} string pairs.
  type Parameters struct {
  	Parameter []struct {
  		Label string `json:"label"`
  		Value string `json:"value"`
  	} `json:"parameter"`
  }
  164. func GenerateNotebook2(ctx *context.Context, req cloudbrain.GenerateModelArtsNotebookReq) (string, error) {
  165. if poolInfos == nil {
  166. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  167. }
  168. imageName, err := GetNotebookImageName(req.ImageId)
  169. if err != nil {
  170. log.Error("GetNotebookImageName failed: %v", err.Error())
  171. return "", err
  172. }
  173. createTime := timeutil.TimeStampNow()
  174. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  175. JobName: req.JobName,
  176. Description: req.Description,
  177. Flavor: req.Spec.SourceSpecId,
  178. Duration: req.AutoStopDurationMs,
  179. ImageID: req.ImageId,
  180. PoolID: poolInfos.PoolInfo[0].PoolId,
  181. Feature: models.NotebookFeature,
  182. Volume: models.VolumeReq{
  183. Capacity: setting.Capacity,
  184. Category: models.EVSCategory,
  185. Ownership: models.ManagedOwnership,
  186. },
  187. WorkspaceID: "0",
  188. })
  189. if err != nil {
  190. log.Error("createNotebook2 failed: %v", err.Error())
  191. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  192. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  193. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  194. JobID: models.TempJobId,
  195. VersionID: models.TempVersionId,
  196. Status: models.TempJobStatus,
  197. Type: models.TypeCloudBrainTwo,
  198. JobName: req.JobName,
  199. JobType: string(models.JobTypeDebug),
  200. })
  201. if errTemp != nil {
  202. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  203. return "", errTemp
  204. }
  205. }
  206. return "", err
  207. }
  208. task := &models.Cloudbrain{
  209. Status: jobResult.Status,
  210. UserID: ctx.User.ID,
  211. RepoID: ctx.Repo.Repository.ID,
  212. JobID: jobResult.ID,
  213. JobName: req.JobName,
  214. FlavorCode: req.Spec.SourceSpecId,
  215. DisplayJobName: req.DisplayJobName,
  216. JobType: string(models.JobTypeDebug),
  217. Type: models.TypeCloudBrainTwo,
  218. Uuid: req.Uuid,
  219. ComputeResource: models.NPUResource,
  220. Image: imageName,
  221. BootFile: req.BootFile,
  222. Description: req.Description,
  223. CreatedUnix: createTime,
  224. UpdatedUnix: createTime,
  225. Spec: req.Spec,
  226. ModelName: req.ModelName,
  227. ModelVersion: req.ModelVersion,
  228. LabelName: req.LabelName,
  229. PreTrainModelUrl: req.PreTrainModelUrl,
  230. CkptName: req.CkptName,
  231. }
  232. err = models.CreateCloudbrain(task)
  233. if err != nil {
  234. return "", err
  235. }
  236. stringId := strconv.FormatInt(task.ID, 10)
  237. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugNPUTask)
  238. return jobResult.ID, nil
  239. }
  240. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  241. createTime := timeutil.TimeStampNow()
  242. var jobResult *models.CreateTrainJobResult
  243. var createErr error
  244. if req.EngineID < 0 {
  245. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  246. JobName: req.JobName,
  247. Description: req.Description,
  248. Config: models.UserImageConfig{
  249. WorkServerNum: req.WorkServerNumber,
  250. AppUrl: req.CodeObsPath,
  251. BootFileUrl: req.BootFileUrl,
  252. DataUrl: req.DataUrl,
  253. TrainUrl: req.TrainUrl,
  254. LogUrl: req.LogUrl,
  255. PoolID: req.PoolID,
  256. CreateVersion: true,
  257. Flavor: models.Flavor{
  258. Code: req.Spec.SourceSpecId,
  259. },
  260. Parameter: req.Parameters,
  261. UserImageUrl: req.UserImageUrl,
  262. UserCommand: req.UserCommand,
  263. },
  264. })
  265. } else {
  266. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  267. JobName: req.JobName,
  268. Description: req.Description,
  269. Config: models.Config{
  270. WorkServerNum: req.WorkServerNumber,
  271. AppUrl: req.CodeObsPath,
  272. BootFileUrl: req.BootFileUrl,
  273. DataUrl: req.DataUrl,
  274. EngineID: req.EngineID,
  275. TrainUrl: req.TrainUrl,
  276. LogUrl: req.LogUrl,
  277. PoolID: req.PoolID,
  278. CreateVersion: true,
  279. Flavor: models.Flavor{
  280. Code: req.Spec.SourceSpecId,
  281. },
  282. Parameter: req.Parameters,
  283. },
  284. })
  285. }
  286. if createErr != nil {
  287. log.Error("createTrainJob failed: %v", createErr.Error())
  288. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  289. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  290. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  291. JobID: models.TempJobId,
  292. VersionID: models.TempVersionId,
  293. Status: models.TempJobStatus,
  294. Type: models.TypeCloudBrainTwo,
  295. JobName: req.JobName,
  296. JobType: string(models.JobTypeTrain),
  297. })
  298. if errTemp != nil {
  299. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  300. return "", errTemp
  301. }
  302. }
  303. return "", createErr
  304. }
  305. jobID := strconv.FormatInt(jobResult.JobID, 10)
  306. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  307. Status: TransTrainJobStatus(jobResult.Status),
  308. UserID: ctx.User.ID,
  309. RepoID: ctx.Repo.Repository.ID,
  310. JobID: jobID,
  311. JobName: req.JobName,
  312. DisplayJobName: req.DisplayJobName,
  313. JobType: string(models.JobTypeTrain),
  314. Type: models.TypeCloudBrainTwo,
  315. VersionID: jobResult.VersionID,
  316. VersionName: jobResult.VersionName,
  317. Uuid: req.Uuid,
  318. DatasetName: req.DatasetName,
  319. CommitID: req.CommitID,
  320. IsLatestVersion: req.IsLatestVersion,
  321. ComputeResource: models.NPUResource,
  322. EngineID: req.EngineID,
  323. TrainUrl: req.TrainUrl,
  324. BranchName: req.BranchName,
  325. Parameters: req.Params,
  326. BootFile: req.BootFile,
  327. DataUrl: req.DataUrl,
  328. LogUrl: req.LogUrl,
  329. FlavorCode: req.Spec.SourceSpecId,
  330. Description: req.Description,
  331. WorkServerNumber: req.WorkServerNumber,
  332. FlavorName: req.FlavorName,
  333. EngineName: req.EngineName,
  334. VersionCount: req.VersionCount,
  335. TotalVersionCount: req.TotalVersionCount,
  336. CreatedUnix: createTime,
  337. UpdatedUnix: createTime,
  338. Spec: req.Spec,
  339. ModelName: req.ModelName,
  340. ModelVersion: req.ModelVersion,
  341. LabelName: req.LabelName,
  342. PreTrainModelUrl: req.PreTrainModelUrl,
  343. CkptName: req.CkptName,
  344. })
  345. if createErr != nil {
  346. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  347. return "", createErr
  348. }
  349. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
  350. return jobID, nil
  351. }
  352. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  353. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  354. JobName: req.JobName,
  355. Description: req.Description,
  356. Config: models.UserImageConfig{
  357. WorkServerNum: req.WorkServerNumber,
  358. AppUrl: req.CodeObsPath,
  359. BootFileUrl: req.BootFileUrl,
  360. DataUrl: req.DataUrl,
  361. TrainUrl: req.TrainUrl,
  362. LogUrl: req.LogUrl,
  363. PoolID: req.PoolID,
  364. CreateVersion: true,
  365. Flavor: models.Flavor{
  366. Code: req.FlavorCode,
  367. },
  368. Parameter: req.Parameters,
  369. UserImageUrl: req.UserImageUrl,
  370. UserCommand: req.UserCommand,
  371. },
  372. })
  373. }
  374. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  375. createTime := timeutil.TimeStampNow()
  376. var jobResult *models.CreateTrainJobResult
  377. var createErr error
  378. if req.EngineID < 0 {
  379. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  380. Description: req.Description,
  381. Config: models.TrainJobVersionUserImageConfig{
  382. WorkServerNum: req.WorkServerNumber,
  383. AppUrl: req.CodeObsPath,
  384. BootFileUrl: req.BootFileUrl,
  385. DataUrl: req.DataUrl,
  386. TrainUrl: req.TrainUrl,
  387. LogUrl: req.LogUrl,
  388. PoolID: req.PoolID,
  389. Flavor: models.Flavor{
  390. Code: req.Spec.SourceSpecId,
  391. },
  392. Parameter: req.Parameters,
  393. PreVersionId: req.PreVersionId,
  394. UserImageUrl: req.UserImageUrl,
  395. UserCommand: req.UserCommand,
  396. },
  397. }, jobId)
  398. } else {
  399. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  400. Description: req.Description,
  401. Config: models.TrainJobVersionConfig{
  402. WorkServerNum: req.WorkServerNumber,
  403. AppUrl: req.CodeObsPath,
  404. BootFileUrl: req.BootFileUrl,
  405. DataUrl: req.DataUrl,
  406. EngineID: req.EngineID,
  407. TrainUrl: req.TrainUrl,
  408. LogUrl: req.LogUrl,
  409. PoolID: req.PoolID,
  410. Flavor: models.Flavor{
  411. Code: req.Spec.SourceSpecId,
  412. },
  413. Parameter: req.Parameters,
  414. PreVersionId: req.PreVersionId,
  415. },
  416. }, jobId)
  417. }
  418. if createErr != nil {
  419. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  420. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  421. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  422. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  423. JobID: jobId,
  424. VersionID: models.TempVersionId,
  425. Status: models.TempJobStatus,
  426. Type: models.TypeCloudBrainTwo,
  427. JobName: req.JobName,
  428. JobType: string(models.JobTypeTrain),
  429. })
  430. if errTemp != nil {
  431. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  432. return errTemp
  433. }
  434. }
  435. return createErr
  436. }
  437. var jobTypes []string
  438. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  439. repo := ctx.Repo.Repository
  440. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  441. RepoID: repo.ID,
  442. Type: models.TypeCloudBrainTwo,
  443. JobTypes: jobTypes,
  444. JobID: strconv.FormatInt(jobResult.JobID, 10),
  445. })
  446. if createErr != nil {
  447. ctx.ServerError("Cloudbrain", createErr)
  448. return createErr
  449. }
  450. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  451. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  452. Status: TransTrainJobStatus(jobResult.Status),
  453. UserID: ctx.User.ID,
  454. RepoID: ctx.Repo.Repository.ID,
  455. JobID: strconv.FormatInt(jobResult.JobID, 10),
  456. JobName: req.JobName,
  457. DisplayJobName: req.DisplayJobName,
  458. JobType: string(models.JobTypeTrain),
  459. Type: models.TypeCloudBrainTwo,
  460. VersionID: jobResult.VersionID,
  461. VersionName: jobResult.VersionName,
  462. Uuid: req.Uuid,
  463. DatasetName: req.DatasetName,
  464. CommitID: req.CommitID,
  465. IsLatestVersion: req.IsLatestVersion,
  466. PreVersionName: req.PreVersionName,
  467. ComputeResource: models.NPUResource,
  468. EngineID: req.EngineID,
  469. TrainUrl: req.TrainUrl,
  470. BranchName: req.BranchName,
  471. Parameters: req.Params,
  472. BootFile: req.BootFile,
  473. DataUrl: req.DataUrl,
  474. LogUrl: req.LogUrl,
  475. PreVersionId: req.PreVersionId,
  476. FlavorCode: req.Spec.SourceSpecId,
  477. Description: req.Description,
  478. WorkServerNumber: req.WorkServerNumber,
  479. FlavorName: req.FlavorName,
  480. EngineName: req.EngineName,
  481. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  482. VersionCount: VersionListCount + 1,
  483. CreatedUnix: createTime,
  484. UpdatedUnix: createTime,
  485. Spec: req.Spec,
  486. ModelName: req.ModelName,
  487. ModelVersion: req.ModelVersion,
  488. LabelName: req.LabelName,
  489. PreTrainModelUrl: req.PreTrainModelUrl,
  490. CkptName: req.CkptName,
  491. })
  492. if createErr != nil {
  493. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  494. return createErr
  495. }
  496. //将训练任务的上一版本的isLatestVersion设置为"0"
  497. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  498. if createErr != nil {
  499. ctx.ServerError("Update IsLatestVersion failed", createErr)
  500. return createErr
  501. }
  502. return createErr
  503. }
  // TransTrainJobStatus converts a numeric training-job status into its
  // symbolic name. Codes 0 through 21 have names; any other value, including
  // negative ones, is returned as its decimal string.
  func TransTrainJobStatus(status int) string {
  	// The index in this table is the status code.
  	names := [...]string{
  		"UNKNOWN",
  		"INIT",
  		"IMAGE_CREATING",
  		"IMAGE_FAILED",
  		"SUBMIT_TRYING",
  		"SUBMIT_FAILED",
  		"DELETE_FAILED",
  		"WAITING",
  		"RUNNING",
  		"KILLING",
  		"COMPLETED",
  		"FAILED",
  		"KILLED",
  		"CANCELED",
  		"LOST",
  		"SCALING",
  		"SUBMIT_MODEL_FAILED",
  		"DEPLOY_SERVICE_FAILED",
  		"CHECK_INIT",
  		"CHECK_RUNNING",
  		"CHECK_RUNNING_COMPLETED",
  		"CHECK_FAILED",
  	}
  	if status < 0 || status >= len(names) {
  		return strconv.Itoa(status)
  	}
  	return names[status]
  }
  // GetOutputPathByCount builds a version directory name: the letter "V"
  // followed by TotalVersionCount zero-padded to at least four characters,
  // e.g. 7 becomes "V0007".
  func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  	return fmt.Sprintf("V%04d", TotalVersionCount)
  }
  559. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
  560. createTime := timeutil.TimeStampNow()
  561. var jobResult *models.CreateTrainJobResult
  562. var createErr error
  563. if req.EngineID < 0 {
  564. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  565. JobName: req.JobName,
  566. Description: req.Description,
  567. Config: models.InfUserImageConfig{
  568. WorkServerNum: req.WorkServerNumber,
  569. AppUrl: req.CodeObsPath,
  570. BootFileUrl: req.BootFileUrl,
  571. DataUrl: req.DataUrl,
  572. // TrainUrl: req.TrainUrl,
  573. LogUrl: req.LogUrl,
  574. PoolID: req.PoolID,
  575. CreateVersion: true,
  576. Flavor: models.Flavor{
  577. Code: req.Spec.SourceSpecId,
  578. },
  579. Parameter: req.Parameters,
  580. UserImageUrl: req.UserImageUrl,
  581. UserCommand: req.UserCommand,
  582. },
  583. })
  584. } else {
  585. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  586. JobName: req.JobName,
  587. Description: req.Description,
  588. InfConfig: models.InfConfig{
  589. WorkServerNum: req.WorkServerNumber,
  590. AppUrl: req.CodeObsPath,
  591. BootFileUrl: req.BootFileUrl,
  592. DataUrl: req.DataUrl,
  593. EngineID: req.EngineID,
  594. // TrainUrl: req.TrainUrl,
  595. LogUrl: req.LogUrl,
  596. PoolID: req.PoolID,
  597. CreateVersion: true,
  598. Flavor: models.Flavor{
  599. Code: req.Spec.SourceSpecId,
  600. },
  601. Parameter: req.Parameters,
  602. },
  603. })
  604. }
  605. if createErr != nil {
  606. log.Error("createInferenceJob failed: %v", err.Error())
  607. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  608. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  609. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  610. JobID: models.TempJobId,
  611. VersionID: models.TempVersionId,
  612. Status: models.TempJobStatus,
  613. Type: models.TypeCloudBrainTwo,
  614. JobName: req.JobName,
  615. JobType: req.JobType,
  616. })
  617. if err != nil {
  618. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  619. return "", err
  620. }
  621. }
  622. return "", err
  623. }
  624. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  625. // if err != nil {
  626. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  627. // return err
  628. // }
  629. jobID := strconv.FormatInt(jobResult.JobID, 10)
  630. err = models.CreateCloudbrain(&models.Cloudbrain{
  631. Status: TransTrainJobStatus(jobResult.Status),
  632. UserID: ctx.User.ID,
  633. RepoID: ctx.Repo.Repository.ID,
  634. JobID: jobID,
  635. JobName: req.JobName,
  636. DisplayJobName: req.DisplayJobName,
  637. JobType: req.JobType,
  638. Type: models.TypeCloudBrainTwo,
  639. VersionID: jobResult.VersionID,
  640. VersionName: jobResult.VersionName,
  641. Uuid: req.Uuid,
  642. DatasetName: req.DatasetName,
  643. CommitID: req.CommitID,
  644. EngineID: req.EngineID,
  645. TrainUrl: req.TrainUrl,
  646. BranchName: req.BranchName,
  647. Parameters: req.Params,
  648. BootFile: req.BootFile,
  649. DataUrl: req.DataUrl,
  650. LogUrl: req.LogUrl,
  651. FlavorCode: req.Spec.SourceSpecId,
  652. Description: req.Description,
  653. WorkServerNumber: req.WorkServerNumber,
  654. FlavorName: req.FlavorName,
  655. EngineName: req.EngineName,
  656. LabelName: req.LabelName,
  657. IsLatestVersion: req.IsLatestVersion,
  658. ComputeResource: models.NPUResource,
  659. VersionCount: req.VersionCount,
  660. TotalVersionCount: req.TotalVersionCount,
  661. ModelName: req.ModelName,
  662. ModelVersion: req.ModelVersion,
  663. CkptName: req.CkptName,
  664. ResultUrl: req.ResultUrl,
  665. CreatedUnix: createTime,
  666. UpdatedUnix: createTime,
  667. Spec: req.Spec,
  668. })
  669. if err != nil {
  670. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  671. return "", err
  672. }
  673. if req.JobType == string(models.JobTypeModelSafety) {
  674. task, err := models.GetCloudbrainByJobID(jobID)
  675. if err == nil {
  676. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  677. }
  678. } else {
  679. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  680. }
  681. return jobID, nil
  682. }
  683. func GetNotebookImageName(imageId string) (string, error) {
  684. var validImage = false
  685. var imageName = ""
  686. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  687. if imageInfo.Id == imageId {
  688. validImage = true
  689. imageName = imageInfo.Value
  690. }
  691. }
  692. if !validImage {
  693. log.Error("the image id(%s) is invalid", imageId)
  694. return imageName, errors.New("the image id is invalid")
  695. }
  696. return imageName, nil
  697. }
  698. func InitSpecialPool() {
  699. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  700. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  701. }
  702. }
  703. func InitMultiNode() {
  704. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  705. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  706. }
  707. }
  708. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  709. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  710. if err != nil {
  711. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  712. return err
  713. }
  714. if result != nil {
  715. oldStatus := task.Status
  716. task.Status = TransTrainJobStatus(result.IntStatus)
  717. task.Duration = result.Duration / 1000
  718. task.TrainJobDuration = result.TrainJobDuration
  719. if task.StartTime == 0 && result.StartTime > 0 {
  720. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  721. }
  722. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  723. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  724. task.EndTime = task.StartTime.Add(task.Duration)
  725. }
  726. task.CorrectCreateUnix()
  727. if oldStatus != task.Status {
  728. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  729. }
  730. err = models.UpdateJob(task)
  731. if err != nil {
  732. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  733. return err
  734. }
  735. }
  736. return nil
  737. }
  738. func HandleNotebookInfo(task *models.Cloudbrain) error {
  739. var result *models.GetNotebook2Result
  740. var err error
  741. if task.Type == models.TypeCloudBrainTwo {
  742. result, err = GetNotebook2(task.JobID)
  743. } else if task.Type == models.TypeCDCenter {
  744. result, err = modelarts_cd.GetNotebook(task.JobID)
  745. }
  746. if err != nil {
  747. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  748. return err
  749. }
  750. if result != nil {
  751. oldStatus := task.Status
  752. task.Status = result.Status
  753. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  754. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  755. }
  756. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  757. task.EndTime = timeutil.TimeStampNow()
  758. }
  759. task.CorrectCreateUnix()
  760. task.ComputeAndSetDuration()
  761. if oldStatus != task.Status {
  762. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  763. }
  764. if task.FlavorCode == "" {
  765. task.FlavorCode = result.Flavor
  766. }
  767. if oldStatus != task.Status && task.Status == string(models.ModelArtsRunning) && task.BootFile != "" {
  768. uploadNoteBookFile(task, result)
  769. }
  770. err = models.UpdateJob(task)
  771. if err != nil {
  772. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  773. return err
  774. }
  775. }
  776. return nil
  777. }
  778. func uploadNoteBookFile(task *models.Cloudbrain, result *models.GetNotebook2Result) {
  779. jupyterUrl := result.Url + "?token=" + result.Token
  780. cookies, xsrf := getCookiesAndCsrf(jupyterUrl)
  781. if xsrf == "" {
  782. log.Error("browser jupyterUrl failed:%v", task.DisplayJobName)
  783. } else {
  784. codePath := setting.JobPath + task.JobName + cloudbrain.CodeMountPath
  785. fileContents, err := ioutil.ReadFile(codePath + "/" + task.BootFile)
  786. if err != nil {
  787. log.Error("read jupyter file failed:%v", task.DisplayJobName, err)
  788. }
  789. base64Content := base64.StdEncoding.EncodeToString(fileContents)
  790. client := getRestyClient()
  791. uploadUrl := getJupyterBaseUrl(result.Url) + "api/contents/" + path.Base(task.BootFile)
  792. res, err := client.R().
  793. SetCookies(cookies).
  794. SetHeader("X-XSRFToken", xsrf).
  795. SetBody(map[string]interface{}{
  796. "type": "file",
  797. "format": "base64",
  798. "name": path.Base(task.BootFile),
  799. "path": path.Base(task.BootFile),
  800. "content": base64Content}).
  801. Put(uploadUrl)
  802. if err != nil {
  803. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  804. } else if res.StatusCode() != http.StatusCreated {
  805. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  806. }
  807. }
  808. }
  // getJupyterBaseUrl returns url with its last path segment removed, keeping the
  // "/" that preceded it, e.g. "https://host/nb/lab" -> "https://host/nb/".
  //
  // Trailing slashes are ignored when locating the last segment, so
  // "https://host/nb/lab/" yields the same result. An empty string, or a string
  // without any "/", yields "".
  func getJupyterBaseUrl(url string) string {
  	end := len(url)
  	// Skip trailing slashes so they are not mistaken for the segment boundary.
  	for end > 0 && url[end-1] == '/' {
  		end--
  	}
  	// Drop the last segment itself, stopping right after the preceding slash.
  	for end > 0 && url[end-1] != '/' {
  		end--
  	}
  	return url[:end]
  }
  814. func getCookiesAndCsrf(jupyterUrl string) ([]*http.Cookie, string) {
  815. log.Info("jupyter url:" + jupyterUrl)
  816. var cookies []*http.Cookie
  817. const retryTimes = 10
  818. for i := 0; i < retryTimes; i++ {
  819. res, err := http.Get(jupyterUrl)
  820. if err != nil {
  821. log.Error("browser jupyterUrl failed.", err)
  822. if i == retryTimes-1 {
  823. return cookies, ""
  824. }
  825. } else {
  826. cookies = res.Cookies()
  827. xsrf := ""
  828. for _, cookie := range cookies {
  829. if cookie.Name == "_xsrf" {
  830. xsrf = cookie.Value
  831. break
  832. }
  833. }
  834. if xsrf != "" {
  835. return cookies, xsrf
  836. }
  837. }
  838. }
  839. return cookies, ""
  840. }
  841. func SyncTempStatusJob() {
  842. jobs, err := models.GetCloudBrainTempJobs()
  843. if err != nil {
  844. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  845. return
  846. }
  847. for _, temp := range jobs {
  848. log.Info("start to handle record: %s", temp.JobName)
  849. if temp.Type == models.TypeCloudBrainTwo {
  850. if temp.JobType == string(models.JobTypeDebug) {
  851. err = handleNotebook(temp)
  852. if err != nil {
  853. log.Error("handleNotebook falied:%v", err)
  854. break
  855. }
  856. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  857. _, err = models.GetCloudbrainByJobID(temp.JobID)
  858. if err != nil {
  859. //one version
  860. err = handleTrainJob(temp)
  861. if err != nil {
  862. log.Error("handleTrainJob falied:%v", err)
  863. break
  864. }
  865. } else {
  866. //multi version
  867. err = handleTrainJobMultiVersion(temp)
  868. if err != nil {
  869. log.Error("handleTrainJobMultiVersion falied:%v", err)
  870. break
  871. }
  872. }
  873. }
  874. }
  875. }
  876. return
  877. }
  878. func handleNotebook(temp *models.CloudbrainTemp) error {
  879. if temp.Status == models.TempJobStatus {
  880. err := handleTempNotebook(temp)
  881. if err != nil {
  882. log.Error("handleTempNotebook failed:%v", err)
  883. return err
  884. }
  885. } else if temp.Status == string(models.ModelArtsStopping) {
  886. res, err := GetNotebook2(temp.JobID)
  887. if err != nil {
  888. log.Error("GetNotebook2 failed:%v", err)
  889. return err
  890. }
  891. temp.Status = res.Status
  892. if temp.Status == string(models.ModelArtsStopped) {
  893. err = models.UpdateCloudbrainTemp(temp)
  894. if err != nil {
  895. log.Error("UpdateCloudbrainTemp failed:%v", err)
  896. return err
  897. }
  898. _, err := DelNotebook2(temp.JobID)
  899. if err != nil {
  900. log.Error("DelNotebook2 failed:%v", err)
  901. return err
  902. }
  903. temp.Status = string(models.ModelArtsDeleted)
  904. err = models.UpdateCloudbrainTemp(temp)
  905. if err != nil {
  906. log.Error("UpdateCloudbrainTemp failed:%v", err)
  907. return err
  908. }
  909. }
  910. }
  911. return nil
  912. }
  913. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  914. var err error
  915. var isExist bool
  916. for {
  917. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  918. if err != nil {
  919. log.Error("GetNotebookList failed:%v", err)
  920. break
  921. }
  922. temp.QueryTimes++
  923. err = models.UpdateCloudbrainTemp(temp)
  924. if err != nil {
  925. log.Error("UpdateCloudbrainTemp failed:%v", err)
  926. }
  927. if result != nil {
  928. for _, notebook := range result.NotebookList {
  929. if temp.JobID == models.TempJobId {
  930. //new notebook
  931. if notebook.JobName == temp.JobName {
  932. isExist = true
  933. temp.Status = notebook.Status
  934. temp.JobID = notebook.JobID
  935. break
  936. }
  937. } else {
  938. //restart: always can find one record
  939. if notebook.JobName == temp.JobName {
  940. if notebook.Status != string(models.ModelArtsStopped) {
  941. isExist = true
  942. temp.Status = notebook.Status
  943. temp.JobID = notebook.JobID
  944. break
  945. }
  946. }
  947. }
  948. }
  949. if isExist {
  950. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  951. if temp.Status == string(models.ModelArtsCreateFailed) {
  952. err = models.UpdateCloudbrainTemp(temp)
  953. if err != nil {
  954. log.Error("UpdateCloudbrainTemp failed:%v", err)
  955. break
  956. }
  957. _, err := DelNotebook2(temp.JobID)
  958. if err != nil {
  959. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  960. break
  961. }
  962. temp.Status = string(models.ModelArtsDeleted)
  963. } else {
  964. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  965. if err != nil {
  966. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  967. break
  968. }
  969. temp.Status = string(models.ModelArtsStopping)
  970. }
  971. models.UpdateCloudbrainTemp(temp)
  972. } else {
  973. log.Error("can not find the record(%s) till now", temp.JobName)
  974. err = errors.New("not found")
  975. break
  976. }
  977. } else {
  978. log.Error("can not find the record(%s) till now", temp.JobName)
  979. err = errors.New("not found")
  980. break
  981. }
  982. break
  983. }
  984. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  985. log.Info("reach MaxTempQueryTimes, set the job failed")
  986. temp.Status = string(models.ModelArtsTrainJobFailed)
  987. err = models.UpdateCloudbrainTemp(temp)
  988. if err != nil {
  989. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  990. return err
  991. }
  992. }
  993. return err
  994. }
  995. func handleTrainJob(temp *models.CloudbrainTemp) error {
  996. if temp.Status == models.TempJobStatus {
  997. err := handleTempTrainJob(temp)
  998. if err != nil {
  999. log.Error("handleTempTrainJob failed:%v", err)
  1000. return err
  1001. }
  1002. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1003. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1004. if err != nil {
  1005. log.Error("GetTrainJob failed:%v", err)
  1006. return err
  1007. }
  1008. temp.Status = TransTrainJobStatus(res.IntStatus)
  1009. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1010. err = models.UpdateCloudbrainTemp(temp)
  1011. if err != nil {
  1012. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1013. return err
  1014. }
  1015. _, err := DelTrainJob(temp.JobID)
  1016. if err != nil {
  1017. log.Error("DelTrainJob failed:%v", err)
  1018. return err
  1019. }
  1020. temp.Status = string(models.ModelArtsDeleted)
  1021. err = models.UpdateCloudbrainTemp(temp)
  1022. if err != nil {
  1023. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1024. return err
  1025. }
  1026. }
  1027. }
  1028. return nil
  1029. }
  1030. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1031. if temp.Status == models.TempJobStatus {
  1032. err := handleTempTrainJobMultiVersion(temp)
  1033. if err != nil {
  1034. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1035. return err
  1036. }
  1037. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1038. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1039. if err != nil {
  1040. log.Error("GetTrainJob failed:%v", err)
  1041. return err
  1042. }
  1043. temp.Status = TransTrainJobStatus(res.IntStatus)
  1044. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1045. err = models.UpdateCloudbrainTemp(temp)
  1046. if err != nil {
  1047. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1048. return err
  1049. }
  1050. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1051. if err != nil {
  1052. log.Error("DelTrainJob failed:%v", err)
  1053. return err
  1054. }
  1055. temp.Status = string(models.ModelArtsDeleted)
  1056. err = models.UpdateCloudbrainTemp(temp)
  1057. if err != nil {
  1058. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1059. return err
  1060. }
  1061. }
  1062. }
  1063. return nil
  1064. }
  1065. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1066. var err error
  1067. var isExist bool
  1068. for {
  1069. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1070. if err != nil {
  1071. log.Error("GetTrainJobVersionList failed:%v", err)
  1072. break
  1073. }
  1074. temp.QueryTimes++
  1075. err = models.UpdateCloudbrainTemp(temp)
  1076. if err != nil {
  1077. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1078. }
  1079. if result != nil {
  1080. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1081. if result.VersionCount == int64(count+1) {
  1082. isExist = true
  1083. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1084. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1085. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1086. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1087. if err != nil {
  1088. log.Error("StopTrainJob failed:%v", err)
  1089. break
  1090. }
  1091. temp.Status = string(models.ModelArtsTrainJobKilling)
  1092. err = models.UpdateCloudbrainTemp(temp)
  1093. if err != nil {
  1094. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1095. break
  1096. }
  1097. } else {
  1098. log.Error("can not find the record(%s) till now", temp.JobName)
  1099. err = errors.New("not found")
  1100. break
  1101. }
  1102. }
  1103. break
  1104. }
  1105. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1106. log.Info("reach MaxTempQueryTimes, set the job failed")
  1107. temp.Status = string(models.ModelArtsTrainJobFailed)
  1108. err = models.UpdateCloudbrainTemp(temp)
  1109. if err != nil {
  1110. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1111. return err
  1112. }
  1113. }
  1114. return err
  1115. }
  1116. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1117. var err error
  1118. var isExist bool
  1119. for {
  1120. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1121. if err != nil {
  1122. log.Error("GetTrainJobList failed:%v", err)
  1123. break
  1124. }
  1125. temp.QueryTimes++
  1126. err = models.UpdateCloudbrainTemp(temp)
  1127. if err != nil {
  1128. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1129. }
  1130. if result != nil {
  1131. for _, job := range result.JobList {
  1132. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1133. isExist = true
  1134. temp.Status = TransTrainJobStatus(job.IntStatus)
  1135. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1136. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1137. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1138. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1139. if err != nil {
  1140. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1141. break
  1142. }
  1143. temp.Status = string(models.ModelArtsTrainJobKilling)
  1144. err = models.UpdateCloudbrainTemp(temp)
  1145. if err != nil {
  1146. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1147. break
  1148. }
  1149. }
  1150. }
  1151. if !isExist {
  1152. log.Error("can not find the record(%s) till now", temp.JobName)
  1153. err = errors.New("not found")
  1154. break
  1155. }
  1156. }
  1157. break
  1158. }
  1159. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1160. log.Info("reach MaxTempQueryTimes, set the job failed")
  1161. temp.Status = string(models.ModelArtsTrainJobFailed)
  1162. err = models.UpdateCloudbrainTemp(temp)
  1163. if err != nil {
  1164. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1165. return err
  1166. }
  1167. }
  1168. return err
  1169. }