You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 40 kB

4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383
  1. package modelarts
  2. import (
  3. "encoding/base64"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io/ioutil"
  8. "net/http"
  9. "path"
  10. "strconv"
  11. "strings"
  12. "github.com/go-resty/resty/v2"
  13. "code.gitea.io/gitea/modules/cloudbrain"
  14. "code.gitea.io/gitea/modules/modelarts_cd"
  15. "code.gitea.io/gitea/models"
  16. "code.gitea.io/gitea/modules/context"
  17. "code.gitea.io/gitea/modules/log"
  18. "code.gitea.io/gitea/modules/notification"
  19. "code.gitea.io/gitea/modules/setting"
  20. "code.gitea.io/gitea/modules/storage"
  21. "code.gitea.io/gitea/modules/timeutil"
  22. )
// Constants shared by the ModelArts notebook, train-job and inference-job
// helpers in this package.
const (
	//notebook

	// storageTypeOBS is the storage type GenerateTask sends in the notebook Spec.
	storageTypeOBS = "obs"
	// autoStopDuration is the auto-stop duration GenerateTask passes to CreateJob
	// (4 hours if the unit is seconds — TODO confirm the unit).
	autoStopDuration = 4 * 60 * 60
	// autoStopDurationMs is the duration GenerateNotebook2 passes to
	// createNotebook2 (4 hours in milliseconds, going by the name).
	autoStopDurationMs = 4 * 60 * 60 * 1000
	// MORDELART_USER_IMAGE_ENGINE_ID is presumably the engine ID that marks a
	// user-supplied image; the job generators in this file treat any negative
	// EngineID that way.
	MORDELART_USER_IMAGE_ENGINE_ID = -1
	DataSetMountPath               = "/home/ma-user/work"
	NotebookEnv                    = "Python3"
	NotebookType                   = "Ascend"
	FlavorInfo                     = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"

	//train-job
	// The commented-out declarations below are legacy hard-coded JSON samples
	// (resource pools, engines, engine versions, flavors); kept verbatim.
	// ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
	// Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
	// EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
	// "]}"
	// TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 more 256GiB\"}" +
	// "]}"

	// Path segments; presumably sub-directories of a job's storage area — TODO confirm.
	CodePath   = "/code/"
	OutputPath = "/output/"
	ResultPath = "/result/"
	LogPath    = "/log/"
	JobPath    = "/job/"

	OrderDesc = "desc" // query downwards
	OrderAsc  = "asc"  // query upwards
	Lines     = 500

	// Key names; presumably run-parameter labels — verify against callers.
	TrainUrl     = "train_url"
	DataUrl      = "data_url"
	MultiDataUrl = "multi_data_url"
	ResultUrl    = "result_url"
	CkptUrl      = "ckpt_url"
	DeviceTarget = "device_target"
	Ascend       = "Ascend"

	PerPage = 10
	// IsLatestVersion / NotLatestVersion are the string flags stored in a task's
	// IsLatestVersion field; GenerateTrainJobVersion writes NotLatestVersion to
	// the previous version.
	IsLatestVersion  = "1"
	NotLatestVersion = "0"
	// VersionCountOne and TotalVersionCount are the counts GenerateTrainJobVersion
	// passes to models.SetVersionCountAndLatestVersion for the previous version.
	VersionCountOne   = 1
	SortByCreateTime  = "create_time"
	ConfigTypeCustom  = "custom"
	TotalVersionCount = 1
)
// Package-level configuration caches.
var (
	// poolInfos is filled lazily from setting.PoolInfos (JSON) the first time a
	// notebook generator needs it; its first entry is used as the pool.
	poolInfos *models.PoolInfos
	// The remaining variables are not assigned in the visible part of this file;
	// presumably they are populated from settings elsewhere — TODO confirm.
	TrainFlavorInfos *Flavor
	SpecialPools     *models.SpecialPools
	MultiNodeConfig  *MultiNodes
)
// GenerateTrainJobReq carries everything needed to create a ModelArts train
// job (or a new version of one) and to persist the matching Cloudbrain record.
// It is consumed by GenerateTrainJob, GenerateTrainJobVersion and
// GenerateModelConvertTrainJob.
type GenerateTrainJobReq struct {
	JobName        string
	DisplayJobName string
	Uuid           string
	Description    string
	CodeObsPath    string // sent to ModelArts as the job's AppUrl
	BootFile       string
	BootFileUrl    string
	DataUrl        string
	TrainUrl       string
	LogUrl         string
	PoolID         string
	WorkServerNumber int
	// EngineID selects the creation path: a negative value uses the
	// user-image API (UserImageUrl/UserCommand), otherwise the engine API.
	EngineID   int64
	Parameters []models.Parameter // parameters sent to ModelArts
	CommitID   string
	IsLatestVersion string
	Params     string // stored in the Cloudbrain record's Parameters field
	BranchName string
	// PreVersionId / PreVersionName identify the previous version; only
	// GenerateTrainJobVersion reads them.
	PreVersionId   int64
	PreVersionName string
	// FlavorCode is used as the flavor by GenerateModelConvertTrainJob; the
	// other generators take the flavor code from Spec.SourceSpecId.
	FlavorCode        string
	FlavorName        string
	VersionCount      int
	EngineName        string
	TotalVersionCount int
	UserImageUrl      string
	UserCommand       string
	DatasetName       string
	Spec              *models.Specification
	ModelName         string
	LabelName         string
	CkptName          string
	ModelVersion      string
	PreTrainModelUrl  string
}
// GenerateInferenceJobReq carries everything needed to create a ModelArts
// inference job and to persist the matching Cloudbrain record. It is consumed
// by GenerateInferenceJob.
type GenerateInferenceJobReq struct {
	JobName        string
	DisplayJobName string
	Uuid           string
	Description    string
	CodeObsPath    string // sent to ModelArts as the job's AppUrl
	BootFile       string
	BootFileUrl    string
	DataUrl        string
	TrainUrl       string // stored in the Cloudbrain record only; not sent to ModelArts
	LogUrl         string
	PoolID         string
	WorkServerNumber int
	// EngineID selects the creation path: a negative value uses the
	// user-image API (UserImageUrl/UserCommand), otherwise the engine API.
	EngineID          int64
	Parameters        []models.Parameter // parameters sent to ModelArts
	CommitID          string
	Params            string // stored in the Cloudbrain record's Parameters field
	BranchName        string
	FlavorName        string
	EngineName        string
	LabelName         string
	IsLatestVersion   string
	VersionCount      int
	TotalVersionCount int
	ModelName         string
	ModelVersion      string
	CkptName          string
	ResultUrl         string
	Spec              *models.Specification // Spec.SourceSpecId is used as the flavor code
	DatasetName       string
	JobType           string // stored as the record's JobType; compared against models.JobTypeModelSafety
	UserImageUrl      string
	UserCommand       string
}
// VersionInfo is the JSON shape {"version":[{"id":..,"value":..,"url":..}]}.
// Presumably it lists the selectable engine versions — TODO confirm.
type VersionInfo struct {
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
		Url   string `json:"url"`
	} `json:"version"`
}
// Flavor is the JSON shape {"flavor":[{"code":..,"value":..,"unitPrice":..}]};
// it is the type of the package-level TrainFlavorInfos variable.
type Flavor struct {
	Info []struct {
		Code      string `json:"code"`
		Value     string `json:"value"`
		UnitPrice int64  `json:"unitPrice"`
	} `json:"flavor"`
}
// Engine is the JSON shape {"engine":[{"id":..,"value":..}]}.
type Engine struct {
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
// ResourcePool is the JSON shape {"resource_pool":[{"id":..,"value":..}]}.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
// MultiNodes is the JSON shape {"multinode":[...]} holding one OrgMultiNode
// entry per element; it is the type of the package-level MultiNodeConfig.
type MultiNodes struct {
	Info []OrgMultiNode `json:"multinode"`
}
// OrgMultiNode is one entry of MultiNodes: an "org" string and a list of ints
// under "node". Presumably the node counts an organization may choose —
// TODO confirm against the code that reads MultiNodeConfig.
type OrgMultiNode struct {
	Org  string `json:"org"`
	Node []int  `json:"node"`
}
// Earlier variant with a named element type, left commented out:
// type Parameter struct {
// Label string `json:"label"`
// Value string `json:"value"`
// }
// type Parameters struct {
// Parameter []Parameter `json:"parameter"`
// }

// Parameters is the JSON shape {"parameter":[{"label":..,"value":..}]}.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}
  191. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  192. var dataActualPath string
  193. if uuid != "" {
  194. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  195. } else {
  196. userPath := setting.UserBasePath + ctx.User.Name + "/"
  197. isExist, err := storage.ObsHasObject(userPath)
  198. if err != nil {
  199. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  200. return err
  201. }
  202. if !isExist {
  203. if err = storage.ObsCreateObject(userPath); err != nil {
  204. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  205. return err
  206. }
  207. }
  208. dataActualPath = setting.Bucket + "/" + userPath
  209. }
  210. if poolInfos == nil {
  211. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  212. }
  213. createTime := timeutil.TimeStampNow()
  214. jobResult, err := CreateJob(models.CreateNotebookParams{
  215. JobName: jobName,
  216. Description: description,
  217. ProfileID: setting.ProfileID,
  218. Flavor: flavor,
  219. Pool: models.Pool{
  220. ID: poolInfos.PoolInfo[0].PoolId,
  221. Name: poolInfos.PoolInfo[0].PoolName,
  222. Type: poolInfos.PoolInfo[0].PoolType,
  223. },
  224. Spec: models.Spec{
  225. Storage: models.Storage{
  226. Type: storageTypeOBS,
  227. Location: models.Location{
  228. Path: dataActualPath,
  229. },
  230. },
  231. AutoStop: models.AutoStop{
  232. Enable: true,
  233. Duration: autoStopDuration,
  234. },
  235. },
  236. })
  237. if err != nil {
  238. log.Error("CreateJob failed: %v", err.Error())
  239. return err
  240. }
  241. err = models.CreateCloudbrain(&models.Cloudbrain{
  242. Status: string(models.JobWaiting),
  243. UserID: ctx.User.ID,
  244. RepoID: ctx.Repo.Repository.ID,
  245. JobID: jobResult.ID,
  246. JobName: jobName,
  247. JobType: string(models.JobTypeDebug),
  248. Type: models.TypeCloudBrainTwo,
  249. Uuid: uuid,
  250. ComputeResource: models.NPUResource,
  251. CreatedUnix: createTime,
  252. UpdatedUnix: createTime,
  253. })
  254. if err != nil {
  255. return err
  256. }
  257. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  258. return nil
  259. }
  260. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification, bootFile string) (string, error) {
  261. if poolInfos == nil {
  262. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  263. }
  264. imageName, err := GetNotebookImageName(imageId)
  265. if err != nil {
  266. log.Error("GetNotebookImageName failed: %v", err.Error())
  267. return "", err
  268. }
  269. createTime := timeutil.TimeStampNow()
  270. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  271. JobName: jobName,
  272. Description: description,
  273. Flavor: spec.SourceSpecId,
  274. Duration: autoStopDurationMs,
  275. ImageID: imageId,
  276. PoolID: poolInfos.PoolInfo[0].PoolId,
  277. Feature: models.NotebookFeature,
  278. Volume: models.VolumeReq{
  279. Capacity: setting.Capacity,
  280. Category: models.EVSCategory,
  281. Ownership: models.ManagedOwnership,
  282. },
  283. WorkspaceID: "0",
  284. })
  285. if err != nil {
  286. log.Error("createNotebook2 failed: %v", err.Error())
  287. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  288. log.Info("(%s)unknown error, set temp status", displayJobName)
  289. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  290. JobID: models.TempJobId,
  291. VersionID: models.TempVersionId,
  292. Status: models.TempJobStatus,
  293. Type: models.TypeCloudBrainTwo,
  294. JobName: jobName,
  295. JobType: string(models.JobTypeDebug),
  296. })
  297. if errTemp != nil {
  298. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  299. return "", errTemp
  300. }
  301. }
  302. return "", err
  303. }
  304. task := &models.Cloudbrain{
  305. Status: jobResult.Status,
  306. UserID: ctx.User.ID,
  307. RepoID: ctx.Repo.Repository.ID,
  308. JobID: jobResult.ID,
  309. JobName: jobName,
  310. FlavorCode: spec.SourceSpecId,
  311. DisplayJobName: displayJobName,
  312. JobType: string(models.JobTypeDebug),
  313. Type: models.TypeCloudBrainTwo,
  314. Uuid: uuid,
  315. ComputeResource: models.NPUResource,
  316. Image: imageName,
  317. BootFile: bootFile,
  318. Description: description,
  319. CreatedUnix: createTime,
  320. UpdatedUnix: createTime,
  321. Spec: spec,
  322. }
  323. err = models.CreateCloudbrain(task)
  324. if err != nil {
  325. return "", err
  326. }
  327. stringId := strconv.FormatInt(task.ID, 10)
  328. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  329. return jobResult.ID, nil
  330. }
  331. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  332. createTime := timeutil.TimeStampNow()
  333. var jobResult *models.CreateTrainJobResult
  334. var createErr error
  335. if req.EngineID < 0 {
  336. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  337. JobName: req.JobName,
  338. Description: req.Description,
  339. Config: models.UserImageConfig{
  340. WorkServerNum: req.WorkServerNumber,
  341. AppUrl: req.CodeObsPath,
  342. BootFileUrl: req.BootFileUrl,
  343. DataUrl: req.DataUrl,
  344. TrainUrl: req.TrainUrl,
  345. LogUrl: req.LogUrl,
  346. PoolID: req.PoolID,
  347. CreateVersion: true,
  348. Flavor: models.Flavor{
  349. Code: req.Spec.SourceSpecId,
  350. },
  351. Parameter: req.Parameters,
  352. UserImageUrl: req.UserImageUrl,
  353. UserCommand: req.UserCommand,
  354. },
  355. })
  356. } else {
  357. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  358. JobName: req.JobName,
  359. Description: req.Description,
  360. Config: models.Config{
  361. WorkServerNum: req.WorkServerNumber,
  362. AppUrl: req.CodeObsPath,
  363. BootFileUrl: req.BootFileUrl,
  364. DataUrl: req.DataUrl,
  365. EngineID: req.EngineID,
  366. TrainUrl: req.TrainUrl,
  367. LogUrl: req.LogUrl,
  368. PoolID: req.PoolID,
  369. CreateVersion: true,
  370. Flavor: models.Flavor{
  371. Code: req.Spec.SourceSpecId,
  372. },
  373. Parameter: req.Parameters,
  374. },
  375. })
  376. }
  377. if createErr != nil {
  378. log.Error("createTrainJob failed: %v", createErr.Error())
  379. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  380. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  381. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  382. JobID: models.TempJobId,
  383. VersionID: models.TempVersionId,
  384. Status: models.TempJobStatus,
  385. Type: models.TypeCloudBrainTwo,
  386. JobName: req.JobName,
  387. JobType: string(models.JobTypeTrain),
  388. })
  389. if errTemp != nil {
  390. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  391. return "", errTemp
  392. }
  393. }
  394. return "", createErr
  395. }
  396. jobID := strconv.FormatInt(jobResult.JobID, 10)
  397. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  398. Status: TransTrainJobStatus(jobResult.Status),
  399. UserID: ctx.User.ID,
  400. RepoID: ctx.Repo.Repository.ID,
  401. JobID: jobID,
  402. JobName: req.JobName,
  403. DisplayJobName: req.DisplayJobName,
  404. JobType: string(models.JobTypeTrain),
  405. Type: models.TypeCloudBrainTwo,
  406. VersionID: jobResult.VersionID,
  407. VersionName: jobResult.VersionName,
  408. Uuid: req.Uuid,
  409. DatasetName: req.DatasetName,
  410. CommitID: req.CommitID,
  411. IsLatestVersion: req.IsLatestVersion,
  412. ComputeResource: models.NPUResource,
  413. EngineID: req.EngineID,
  414. TrainUrl: req.TrainUrl,
  415. BranchName: req.BranchName,
  416. Parameters: req.Params,
  417. BootFile: req.BootFile,
  418. DataUrl: req.DataUrl,
  419. LogUrl: req.LogUrl,
  420. FlavorCode: req.Spec.SourceSpecId,
  421. Description: req.Description,
  422. WorkServerNumber: req.WorkServerNumber,
  423. FlavorName: req.FlavorName,
  424. EngineName: req.EngineName,
  425. VersionCount: req.VersionCount,
  426. TotalVersionCount: req.TotalVersionCount,
  427. CreatedUnix: createTime,
  428. UpdatedUnix: createTime,
  429. Spec: req.Spec,
  430. ModelName: req.ModelName,
  431. ModelVersion: req.ModelVersion,
  432. LabelName: req.LabelName,
  433. PreTrainModelUrl: req.PreTrainModelUrl,
  434. CkptName: req.CkptName,
  435. })
  436. if createErr != nil {
  437. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  438. return "", createErr
  439. }
  440. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
  441. return jobID, nil
  442. }
  443. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  444. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  445. JobName: req.JobName,
  446. Description: req.Description,
  447. Config: models.UserImageConfig{
  448. WorkServerNum: req.WorkServerNumber,
  449. AppUrl: req.CodeObsPath,
  450. BootFileUrl: req.BootFileUrl,
  451. DataUrl: req.DataUrl,
  452. TrainUrl: req.TrainUrl,
  453. LogUrl: req.LogUrl,
  454. PoolID: req.PoolID,
  455. CreateVersion: true,
  456. Flavor: models.Flavor{
  457. Code: req.FlavorCode,
  458. },
  459. Parameter: req.Parameters,
  460. UserImageUrl: req.UserImageUrl,
  461. UserCommand: req.UserCommand,
  462. },
  463. })
  464. }
  465. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  466. createTime := timeutil.TimeStampNow()
  467. var jobResult *models.CreateTrainJobResult
  468. var createErr error
  469. if req.EngineID < 0 {
  470. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  471. Description: req.Description,
  472. Config: models.TrainJobVersionUserImageConfig{
  473. WorkServerNum: req.WorkServerNumber,
  474. AppUrl: req.CodeObsPath,
  475. BootFileUrl: req.BootFileUrl,
  476. DataUrl: req.DataUrl,
  477. TrainUrl: req.TrainUrl,
  478. LogUrl: req.LogUrl,
  479. PoolID: req.PoolID,
  480. Flavor: models.Flavor{
  481. Code: req.Spec.SourceSpecId,
  482. },
  483. Parameter: req.Parameters,
  484. PreVersionId: req.PreVersionId,
  485. UserImageUrl: req.UserImageUrl,
  486. UserCommand: req.UserCommand,
  487. },
  488. }, jobId)
  489. } else {
  490. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  491. Description: req.Description,
  492. Config: models.TrainJobVersionConfig{
  493. WorkServerNum: req.WorkServerNumber,
  494. AppUrl: req.CodeObsPath,
  495. BootFileUrl: req.BootFileUrl,
  496. DataUrl: req.DataUrl,
  497. EngineID: req.EngineID,
  498. TrainUrl: req.TrainUrl,
  499. LogUrl: req.LogUrl,
  500. PoolID: req.PoolID,
  501. Flavor: models.Flavor{
  502. Code: req.Spec.SourceSpecId,
  503. },
  504. Parameter: req.Parameters,
  505. PreVersionId: req.PreVersionId,
  506. },
  507. }, jobId)
  508. }
  509. if createErr != nil {
  510. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  511. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  512. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  513. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  514. JobID: jobId,
  515. VersionID: models.TempVersionId,
  516. Status: models.TempJobStatus,
  517. Type: models.TypeCloudBrainTwo,
  518. JobName: req.JobName,
  519. JobType: string(models.JobTypeTrain),
  520. })
  521. if errTemp != nil {
  522. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  523. return errTemp
  524. }
  525. }
  526. return createErr
  527. }
  528. var jobTypes []string
  529. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  530. repo := ctx.Repo.Repository
  531. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  532. RepoID: repo.ID,
  533. Type: models.TypeCloudBrainTwo,
  534. JobTypes: jobTypes,
  535. JobID: strconv.FormatInt(jobResult.JobID, 10),
  536. })
  537. if createErr != nil {
  538. ctx.ServerError("Cloudbrain", createErr)
  539. return createErr
  540. }
  541. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  542. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  543. Status: TransTrainJobStatus(jobResult.Status),
  544. UserID: ctx.User.ID,
  545. RepoID: ctx.Repo.Repository.ID,
  546. JobID: strconv.FormatInt(jobResult.JobID, 10),
  547. JobName: req.JobName,
  548. DisplayJobName: req.DisplayJobName,
  549. JobType: string(models.JobTypeTrain),
  550. Type: models.TypeCloudBrainTwo,
  551. VersionID: jobResult.VersionID,
  552. VersionName: jobResult.VersionName,
  553. Uuid: req.Uuid,
  554. DatasetName: req.DatasetName,
  555. CommitID: req.CommitID,
  556. IsLatestVersion: req.IsLatestVersion,
  557. PreVersionName: req.PreVersionName,
  558. ComputeResource: models.NPUResource,
  559. EngineID: req.EngineID,
  560. TrainUrl: req.TrainUrl,
  561. BranchName: req.BranchName,
  562. Parameters: req.Params,
  563. BootFile: req.BootFile,
  564. DataUrl: req.DataUrl,
  565. LogUrl: req.LogUrl,
  566. PreVersionId: req.PreVersionId,
  567. FlavorCode: req.Spec.SourceSpecId,
  568. Description: req.Description,
  569. WorkServerNumber: req.WorkServerNumber,
  570. FlavorName: req.FlavorName,
  571. EngineName: req.EngineName,
  572. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  573. VersionCount: VersionListCount + 1,
  574. CreatedUnix: createTime,
  575. UpdatedUnix: createTime,
  576. Spec: req.Spec,
  577. ModelName: req.ModelName,
  578. ModelVersion: req.ModelVersion,
  579. LabelName: req.LabelName,
  580. PreTrainModelUrl: req.PreTrainModelUrl,
  581. CkptName: req.CkptName,
  582. })
  583. if createErr != nil {
  584. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  585. return createErr
  586. }
  587. //将训练任务的上一版本的isLatestVersion设置为"0"
  588. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  589. if createErr != nil {
  590. ctx.ServerError("Update IsLatestVersion failed", createErr)
  591. return createErr
  592. }
  593. return createErr
  594. }
  595. func TransTrainJobStatus(status int) string {
  596. switch status {
  597. case 0:
  598. return "UNKNOWN"
  599. case 1:
  600. return "INIT"
  601. case 2:
  602. return "IMAGE_CREATING"
  603. case 3:
  604. return "IMAGE_FAILED"
  605. case 4:
  606. return "SUBMIT_TRYING"
  607. case 5:
  608. return "SUBMIT_FAILED"
  609. case 6:
  610. return "DELETE_FAILED"
  611. case 7:
  612. return "WAITING"
  613. case 8:
  614. return "RUNNING"
  615. case 9:
  616. return "KILLING"
  617. case 10:
  618. return "COMPLETED"
  619. case 11:
  620. return "FAILED"
  621. case 12:
  622. return "KILLED"
  623. case 13:
  624. return "CANCELED"
  625. case 14:
  626. return "LOST"
  627. case 15:
  628. return "SCALING"
  629. case 16:
  630. return "SUBMIT_MODEL_FAILED"
  631. case 17:
  632. return "DEPLOY_SERVICE_FAILED"
  633. case 18:
  634. return "CHECK_INIT"
  635. case 19:
  636. return "CHECK_RUNNING"
  637. case 20:
  638. return "CHECK_RUNNING_COMPLETED"
  639. case 21:
  640. return "CHECK_FAILED"
  641. default:
  642. return strconv.Itoa(status)
  643. }
  644. }
  645. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  646. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  647. VersionOutputPath = "V" + talVersionCountToString
  648. return VersionOutputPath
  649. }
  650. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
  651. createTime := timeutil.TimeStampNow()
  652. var jobResult *models.CreateTrainJobResult
  653. var createErr error
  654. if req.EngineID < 0 {
  655. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  656. JobName: req.JobName,
  657. Description: req.Description,
  658. Config: models.InfUserImageConfig{
  659. WorkServerNum: req.WorkServerNumber,
  660. AppUrl: req.CodeObsPath,
  661. BootFileUrl: req.BootFileUrl,
  662. DataUrl: req.DataUrl,
  663. // TrainUrl: req.TrainUrl,
  664. LogUrl: req.LogUrl,
  665. PoolID: req.PoolID,
  666. CreateVersion: true,
  667. Flavor: models.Flavor{
  668. Code: req.Spec.SourceSpecId,
  669. },
  670. Parameter: req.Parameters,
  671. UserImageUrl: req.UserImageUrl,
  672. UserCommand: req.UserCommand,
  673. },
  674. })
  675. } else {
  676. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  677. JobName: req.JobName,
  678. Description: req.Description,
  679. InfConfig: models.InfConfig{
  680. WorkServerNum: req.WorkServerNumber,
  681. AppUrl: req.CodeObsPath,
  682. BootFileUrl: req.BootFileUrl,
  683. DataUrl: req.DataUrl,
  684. EngineID: req.EngineID,
  685. // TrainUrl: req.TrainUrl,
  686. LogUrl: req.LogUrl,
  687. PoolID: req.PoolID,
  688. CreateVersion: true,
  689. Flavor: models.Flavor{
  690. Code: req.Spec.SourceSpecId,
  691. },
  692. Parameter: req.Parameters,
  693. },
  694. })
  695. }
  696. if createErr != nil {
  697. log.Error("createInferenceJob failed: %v", err.Error())
  698. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  699. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  700. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  701. JobID: models.TempJobId,
  702. VersionID: models.TempVersionId,
  703. Status: models.TempJobStatus,
  704. Type: models.TypeCloudBrainTwo,
  705. JobName: req.JobName,
  706. JobType: req.JobType,
  707. })
  708. if err != nil {
  709. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  710. return "", err
  711. }
  712. }
  713. return "", err
  714. }
  715. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  716. // if err != nil {
  717. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  718. // return err
  719. // }
  720. jobID := strconv.FormatInt(jobResult.JobID, 10)
  721. err = models.CreateCloudbrain(&models.Cloudbrain{
  722. Status: TransTrainJobStatus(jobResult.Status),
  723. UserID: ctx.User.ID,
  724. RepoID: ctx.Repo.Repository.ID,
  725. JobID: jobID,
  726. JobName: req.JobName,
  727. DisplayJobName: req.DisplayJobName,
  728. JobType: req.JobType,
  729. Type: models.TypeCloudBrainTwo,
  730. VersionID: jobResult.VersionID,
  731. VersionName: jobResult.VersionName,
  732. Uuid: req.Uuid,
  733. DatasetName: req.DatasetName,
  734. CommitID: req.CommitID,
  735. EngineID: req.EngineID,
  736. TrainUrl: req.TrainUrl,
  737. BranchName: req.BranchName,
  738. Parameters: req.Params,
  739. BootFile: req.BootFile,
  740. DataUrl: req.DataUrl,
  741. LogUrl: req.LogUrl,
  742. FlavorCode: req.Spec.SourceSpecId,
  743. Description: req.Description,
  744. WorkServerNumber: req.WorkServerNumber,
  745. FlavorName: req.FlavorName,
  746. EngineName: req.EngineName,
  747. LabelName: req.LabelName,
  748. IsLatestVersion: req.IsLatestVersion,
  749. ComputeResource: models.NPUResource,
  750. VersionCount: req.VersionCount,
  751. TotalVersionCount: req.TotalVersionCount,
  752. ModelName: req.ModelName,
  753. ModelVersion: req.ModelVersion,
  754. CkptName: req.CkptName,
  755. ResultUrl: req.ResultUrl,
  756. CreatedUnix: createTime,
  757. UpdatedUnix: createTime,
  758. Spec: req.Spec,
  759. })
  760. if err != nil {
  761. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  762. return "", err
  763. }
  764. if req.JobType == string(models.JobTypeModelSafety) {
  765. task, err := models.GetCloudbrainByJobID(jobID)
  766. if err == nil {
  767. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  768. }
  769. } else {
  770. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  771. }
  772. return jobID, nil
  773. }
  774. func GetNotebookImageName(imageId string) (string, error) {
  775. var validImage = false
  776. var imageName = ""
  777. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  778. if imageInfo.Id == imageId {
  779. validImage = true
  780. imageName = imageInfo.Value
  781. }
  782. }
  783. if !validImage {
  784. log.Error("the image id(%s) is invalid", imageId)
  785. return imageName, errors.New("the image id is invalid")
  786. }
  787. return imageName, nil
  788. }
  789. func InitSpecialPool() {
  790. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  791. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  792. }
  793. }
  794. func InitMultiNode() {
  795. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  796. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  797. }
  798. }
  799. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  800. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  801. if err != nil {
  802. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  803. return err
  804. }
  805. if result != nil {
  806. oldStatus := task.Status
  807. task.Status = TransTrainJobStatus(result.IntStatus)
  808. task.Duration = result.Duration / 1000
  809. task.TrainJobDuration = result.TrainJobDuration
  810. if task.StartTime == 0 && result.StartTime > 0 {
  811. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  812. }
  813. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  814. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  815. task.EndTime = task.StartTime.Add(task.Duration)
  816. }
  817. task.CorrectCreateUnix()
  818. if oldStatus != task.Status {
  819. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  820. }
  821. err = models.UpdateJob(task)
  822. if err != nil {
  823. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  824. return err
  825. }
  826. }
  827. return nil
  828. }
  829. func HandleNotebookInfo(task *models.Cloudbrain) error {
  830. var result *models.GetNotebook2Result
  831. var err error
  832. if task.Type == models.TypeCloudBrainTwo {
  833. result, err = GetNotebook2(task.JobID)
  834. } else if task.Type == models.TypeCDCenter {
  835. result, err = modelarts_cd.GetNotebook(task.JobID)
  836. }
  837. if err != nil {
  838. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  839. return err
  840. }
  841. if result != nil {
  842. oldStatus := task.Status
  843. task.Status = result.Status
  844. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  845. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  846. }
  847. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  848. task.EndTime = timeutil.TimeStampNow()
  849. }
  850. task.CorrectCreateUnix()
  851. task.ComputeAndSetDuration()
  852. if oldStatus != task.Status {
  853. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  854. }
  855. if task.FlavorCode == "" {
  856. task.FlavorCode = result.Flavor
  857. }
  858. if oldStatus != task.Status && task.Status == string(models.ModelArtsRunning) && task.BootFile != "" {
  859. uploadNoteBookFile(task, result)
  860. }
  861. err = models.UpdateJob(task)
  862. if err != nil {
  863. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  864. return err
  865. }
  866. }
  867. return nil
  868. }
  869. func uploadNoteBookFile(task *models.Cloudbrain, result *models.GetNotebook2Result) {
  870. jupyterUrl := result.Url + "?token=" + result.Token
  871. client := getRestyClient()
  872. cookies, xsrf := getCookiesAndCsrf(client, jupyterUrl)
  873. if xsrf == "" {
  874. log.Error("browser jupyterUrl failed:%v", task.DisplayJobName)
  875. } else {
  876. codePath := setting.JobPath + task.JobName + cloudbrain.CodeMountPath
  877. fileContents, err := ioutil.ReadFile(codePath + "/" + task.BootFile)
  878. if err != nil {
  879. log.Error("read jupyter file failed:%v", task.DisplayJobName, err)
  880. }
  881. base64Content := base64.StdEncoding.EncodeToString(fileContents)
  882. uploadUrl := getJupyterBaseUrl(result.Url) + "api/contents/" + path.Base(task.BootFile)
  883. res, err := client.R().
  884. SetCookies(cookies).
  885. SetHeader("X-XSRFToken", xsrf).
  886. SetBody(map[string]interface{}{
  887. "type": "file",
  888. "format": "base64",
  889. "name": path.Base(task.BootFile),
  890. "path": path.Base(task.BootFile),
  891. "content": base64Content}).
  892. Put(uploadUrl)
  893. if err != nil {
  894. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  895. } else if res.StatusCode() != http.StatusCreated {
  896. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  897. }
  898. }
  899. }
  // getJupyterBaseUrl strips the last path element from url and returns what
  // precedes it, including the separating slash, e.g.
  // "https://host/nb/lab" -> "https://host/nb/".
  //
  // Trailing slashes are ignored when locating the last element, so
  // "https://host/nb/lab/" yields the same result. An input without any slash
  // (including the empty string) yields "".
  func getJupyterBaseUrl(url string) string {
  	trimmed := strings.TrimRight(url, "/")
  	// LastIndex returns -1 when there is no slash, which makes the slice empty.
  	return trimmed[:strings.LastIndex(trimmed, "/")+1]
  }
  905. func getCookiesAndCsrf(client *resty.Client, jupyterUrl string) ([]*http.Cookie, string) {
  906. var cookies []*http.Cookie
  907. for i := 0; i < 4; i++ {
  908. res, err := client.R().Get(jupyterUrl)
  909. if err != nil {
  910. log.Error("browser jupyterUrl failed.")
  911. return cookies, ""
  912. } else {
  913. cookies = res.Cookies()
  914. xsrf := ""
  915. for _, cookie := range cookies {
  916. if cookie.Name == "_xsrf" {
  917. xsrf = cookie.Value
  918. }
  919. }
  920. if xsrf != "" {
  921. return cookies, xsrf
  922. }
  923. }
  924. }
  925. return cookies, ""
  926. }
  927. func SyncTempStatusJob() {
  928. jobs, err := models.GetCloudBrainTempJobs()
  929. if err != nil {
  930. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  931. return
  932. }
  933. for _, temp := range jobs {
  934. log.Info("start to handle record: %s", temp.JobName)
  935. if temp.Type == models.TypeCloudBrainTwo {
  936. if temp.JobType == string(models.JobTypeDebug) {
  937. err = handleNotebook(temp)
  938. if err != nil {
  939. log.Error("handleNotebook falied:%v", err)
  940. break
  941. }
  942. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  943. _, err = models.GetCloudbrainByJobID(temp.JobID)
  944. if err != nil {
  945. //one version
  946. err = handleTrainJob(temp)
  947. if err != nil {
  948. log.Error("handleTrainJob falied:%v", err)
  949. break
  950. }
  951. } else {
  952. //multi version
  953. err = handleTrainJobMultiVersion(temp)
  954. if err != nil {
  955. log.Error("handleTrainJobMultiVersion falied:%v", err)
  956. break
  957. }
  958. }
  959. }
  960. }
  961. }
  962. return
  963. }
  964. func handleNotebook(temp *models.CloudbrainTemp) error {
  965. if temp.Status == models.TempJobStatus {
  966. err := handleTempNotebook(temp)
  967. if err != nil {
  968. log.Error("handleTempNotebook failed:%v", err)
  969. return err
  970. }
  971. } else if temp.Status == string(models.ModelArtsStopping) {
  972. res, err := GetNotebook2(temp.JobID)
  973. if err != nil {
  974. log.Error("GetNotebook2 failed:%v", err)
  975. return err
  976. }
  977. temp.Status = res.Status
  978. if temp.Status == string(models.ModelArtsStopped) {
  979. err = models.UpdateCloudbrainTemp(temp)
  980. if err != nil {
  981. log.Error("UpdateCloudbrainTemp failed:%v", err)
  982. return err
  983. }
  984. _, err := DelNotebook2(temp.JobID)
  985. if err != nil {
  986. log.Error("DelNotebook2 failed:%v", err)
  987. return err
  988. }
  989. temp.Status = string(models.ModelArtsDeleted)
  990. err = models.UpdateCloudbrainTemp(temp)
  991. if err != nil {
  992. log.Error("UpdateCloudbrainTemp failed:%v", err)
  993. return err
  994. }
  995. }
  996. }
  997. return nil
  998. }
  999. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  1000. var err error
  1001. var isExist bool
  1002. for {
  1003. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  1004. if err != nil {
  1005. log.Error("GetNotebookList failed:%v", err)
  1006. break
  1007. }
  1008. temp.QueryTimes++
  1009. err = models.UpdateCloudbrainTemp(temp)
  1010. if err != nil {
  1011. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1012. }
  1013. if result != nil {
  1014. for _, notebook := range result.NotebookList {
  1015. if temp.JobID == models.TempJobId {
  1016. //new notebook
  1017. if notebook.JobName == temp.JobName {
  1018. isExist = true
  1019. temp.Status = notebook.Status
  1020. temp.JobID = notebook.JobID
  1021. break
  1022. }
  1023. } else {
  1024. //restart: always can find one record
  1025. if notebook.JobName == temp.JobName {
  1026. if notebook.Status != string(models.ModelArtsStopped) {
  1027. isExist = true
  1028. temp.Status = notebook.Status
  1029. temp.JobID = notebook.JobID
  1030. break
  1031. }
  1032. }
  1033. }
  1034. }
  1035. if isExist {
  1036. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1037. if temp.Status == string(models.ModelArtsCreateFailed) {
  1038. err = models.UpdateCloudbrainTemp(temp)
  1039. if err != nil {
  1040. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1041. break
  1042. }
  1043. _, err := DelNotebook2(temp.JobID)
  1044. if err != nil {
  1045. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  1046. break
  1047. }
  1048. temp.Status = string(models.ModelArtsDeleted)
  1049. } else {
  1050. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  1051. if err != nil {
  1052. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  1053. break
  1054. }
  1055. temp.Status = string(models.ModelArtsStopping)
  1056. }
  1057. models.UpdateCloudbrainTemp(temp)
  1058. } else {
  1059. log.Error("can not find the record(%s) till now", temp.JobName)
  1060. err = errors.New("not found")
  1061. break
  1062. }
  1063. } else {
  1064. log.Error("can not find the record(%s) till now", temp.JobName)
  1065. err = errors.New("not found")
  1066. break
  1067. }
  1068. break
  1069. }
  1070. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1071. log.Info("reach MaxTempQueryTimes, set the job failed")
  1072. temp.Status = string(models.ModelArtsTrainJobFailed)
  1073. err = models.UpdateCloudbrainTemp(temp)
  1074. if err != nil {
  1075. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1076. return err
  1077. }
  1078. }
  1079. return err
  1080. }
  1081. func handleTrainJob(temp *models.CloudbrainTemp) error {
  1082. if temp.Status == models.TempJobStatus {
  1083. err := handleTempTrainJob(temp)
  1084. if err != nil {
  1085. log.Error("handleTempTrainJob failed:%v", err)
  1086. return err
  1087. }
  1088. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1089. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1090. if err != nil {
  1091. log.Error("GetTrainJob failed:%v", err)
  1092. return err
  1093. }
  1094. temp.Status = TransTrainJobStatus(res.IntStatus)
  1095. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1096. err = models.UpdateCloudbrainTemp(temp)
  1097. if err != nil {
  1098. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1099. return err
  1100. }
  1101. _, err := DelTrainJob(temp.JobID)
  1102. if err != nil {
  1103. log.Error("DelTrainJob failed:%v", err)
  1104. return err
  1105. }
  1106. temp.Status = string(models.ModelArtsDeleted)
  1107. err = models.UpdateCloudbrainTemp(temp)
  1108. if err != nil {
  1109. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1110. return err
  1111. }
  1112. }
  1113. }
  1114. return nil
  1115. }
  1116. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1117. if temp.Status == models.TempJobStatus {
  1118. err := handleTempTrainJobMultiVersion(temp)
  1119. if err != nil {
  1120. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1121. return err
  1122. }
  1123. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1124. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1125. if err != nil {
  1126. log.Error("GetTrainJob failed:%v", err)
  1127. return err
  1128. }
  1129. temp.Status = TransTrainJobStatus(res.IntStatus)
  1130. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1131. err = models.UpdateCloudbrainTemp(temp)
  1132. if err != nil {
  1133. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1134. return err
  1135. }
  1136. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1137. if err != nil {
  1138. log.Error("DelTrainJob failed:%v", err)
  1139. return err
  1140. }
  1141. temp.Status = string(models.ModelArtsDeleted)
  1142. err = models.UpdateCloudbrainTemp(temp)
  1143. if err != nil {
  1144. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1145. return err
  1146. }
  1147. }
  1148. }
  1149. return nil
  1150. }
  1151. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1152. var err error
  1153. var isExist bool
  1154. for {
  1155. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1156. if err != nil {
  1157. log.Error("GetTrainJobVersionList failed:%v", err)
  1158. break
  1159. }
  1160. temp.QueryTimes++
  1161. err = models.UpdateCloudbrainTemp(temp)
  1162. if err != nil {
  1163. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1164. }
  1165. if result != nil {
  1166. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1167. if result.VersionCount == int64(count+1) {
  1168. isExist = true
  1169. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1170. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1171. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1172. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1173. if err != nil {
  1174. log.Error("StopTrainJob failed:%v", err)
  1175. break
  1176. }
  1177. temp.Status = string(models.ModelArtsTrainJobKilling)
  1178. err = models.UpdateCloudbrainTemp(temp)
  1179. if err != nil {
  1180. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1181. break
  1182. }
  1183. } else {
  1184. log.Error("can not find the record(%s) till now", temp.JobName)
  1185. err = errors.New("not found")
  1186. break
  1187. }
  1188. }
  1189. break
  1190. }
  1191. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1192. log.Info("reach MaxTempQueryTimes, set the job failed")
  1193. temp.Status = string(models.ModelArtsTrainJobFailed)
  1194. err = models.UpdateCloudbrainTemp(temp)
  1195. if err != nil {
  1196. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1197. return err
  1198. }
  1199. }
  1200. return err
  1201. }
  1202. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1203. var err error
  1204. var isExist bool
  1205. for {
  1206. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1207. if err != nil {
  1208. log.Error("GetTrainJobList failed:%v", err)
  1209. break
  1210. }
  1211. temp.QueryTimes++
  1212. err = models.UpdateCloudbrainTemp(temp)
  1213. if err != nil {
  1214. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1215. }
  1216. if result != nil {
  1217. for _, job := range result.JobList {
  1218. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1219. isExist = true
  1220. temp.Status = TransTrainJobStatus(job.IntStatus)
  1221. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1222. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1223. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1224. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1225. if err != nil {
  1226. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1227. break
  1228. }
  1229. temp.Status = string(models.ModelArtsTrainJobKilling)
  1230. err = models.UpdateCloudbrainTemp(temp)
  1231. if err != nil {
  1232. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1233. break
  1234. }
  1235. }
  1236. }
  1237. if !isExist {
  1238. log.Error("can not find the record(%s) till now", temp.JobName)
  1239. err = errors.New("not found")
  1240. break
  1241. }
  1242. }
  1243. break
  1244. }
  1245. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1246. log.Info("reach MaxTempQueryTimes, set the job failed")
  1247. temp.Status = string(models.ModelArtsTrainJobFailed)
  1248. err = models.UpdateCloudbrainTemp(temp)
  1249. if err != nil {
  1250. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1251. return err
  1252. }
  1253. }
  1254. return err
  1255. }