You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

modelarts.go 12 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. package modelarts
  import (
  	"encoding/json"
  	"errors"
  	"path"
  	"strconv"

  	"code.gitea.io/gitea/models"
  	"code.gitea.io/gitea/modules/context"
  	"code.gitea.io/gitea/modules/log"
  	"code.gitea.io/gitea/modules/setting"
  	"code.gitea.io/gitea/modules/storage"
  )
  // Package-level constants for the ModelArts integration: notebook (debug)
  // defaults first, then values used by train jobs.
  const (
  	//notebook
  	// storageTypeOBS is the storage type GenerateTask sends for the notebook volume.
  	storageTypeOBS = "obs"
  	// autoStopDuration is the auto-stop duration GenerateTask sends for a
  	// notebook (4*60*60; presumably seconds, i.e. four hours — TODO confirm
  	// the unit against the ModelArts API).
  	autoStopDuration = 4 * 60 * 60
  	DataSetMountPath = "/home/ma-user/work"
  	NotebookEnv = "Python3"
  	NotebookType = "Ascend"
  	FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  	//train-job
  	// The commented-out declarations below are disabled inline JSON defaults
  	// for resource pools, engines, engine versions and train-job flavors. They
  	// are kept verbatim; their shapes match the ResourcePool, Engine,
  	// VersionInfo and Flavor types declared in this file.
  	// ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  	// Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  	// EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  	// "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  	// "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  	// "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  	// "]}"
  	// TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  	// "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  	// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  	// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  	// "]}"
  	// Path segments for a train job's code, output, log and job directories
  	// (presumably appended to an OBS prefix by callers — not shown here).
  	CodePath = "/code/"
  	OutputPath = "/output/"
  	LogPath = "/log/"
  	JobPath = "/job/"
  	OrderDesc = "desc" // query downward
  	OrderAsc = "asc" // query upward
  	Lines = 20
  	TrainUrl = "train_url"
  	DataUrl = "data_url"
  	PerPage = 10
  	// IsLatestVersion and NotLatestVersion are the string flags that
  	// GenerateTrainJobVersion stores to mark which version of a train job is
  	// the newest one.
  	IsLatestVersion = "1"
  	NotLatestVersion = "0"
  	// ComputeResource is the compute resource recorded on every train-job
  	// Cloudbrain row created in this file.
  	ComputeResource = "NPU"
  	InitFatherVersionName = "V0001"
  	VersionCount = 1
  	SortByCreateTime = "create_time"
  	ConfigTypeCustom = "custom"
  )
  var (
  	// poolInfos caches the resource-pool list; GenerateTask fills it on first
  	// use by decoding setting.PoolInfos as JSON.
  	poolInfos *models.PoolInfos
  	// FlavorInfos is not assigned anywhere in the visible part of this file;
  	// presumably it is populated by other code in the package — TODO confirm.
  	FlavorInfos *models.FlavorInfos
  )
  // GenerateTrainJobReq carries everything GenerateTrainJob needs to create a
  // new train job remotely and to record it as a Cloudbrain row.
  type GenerateTrainJobReq struct {
  	JobName string
  	// Uuid identifies the dataset attachment; it is looked up with
  	// models.GetAttachmentByUUID to obtain the dataset name.
  	Uuid string
  	Description string
  	// CodeObsPath is sent to the remote API as the job's AppUrl.
  	CodeObsPath string
  	// BootFile is stored on the Cloudbrain row; BootFileUrl is what is sent
  	// to the remote API.
  	BootFile string
  	BootFileUrl string
  	DataUrl string
  	TrainUrl string
  	FlavorCode string
  	LogUrl string
  	PoolID string
  	WorkServerNumber int
  	EngineID int64
  	// Parameters is sent to the remote API; Params (below) is the string
  	// form stored on the Cloudbrain row.
  	Parameters []models.Parameter
  	CommitID string
  	IsLatestVersion string
  	Params string
  	BranchName string
  	FatherVersionName string
  	FlavorName string
  	VersionCount int
  }
  // GenerateTrainJobVersionReq carries everything GenerateTrainJobVersion needs
  // to create a new version of an existing train job and to record it as a
  // Cloudbrain row.
  type GenerateTrainJobVersionReq struct {
  	JobName string
  	// Uuid identifies the dataset attachment; it is looked up with
  	// models.GetAttachmentByUUID to obtain the dataset name.
  	Uuid string
  	Description string
  	// CodeObsPath is sent to the remote API as the version's AppUrl.
  	CodeObsPath string
  	// BootFile is stored on the Cloudbrain row; BootFileUrl is what is sent
  	// to the remote API.
  	BootFile string
  	BootFileUrl string
  	DataUrl string
  	TrainUrl string
  	FlavorCode string
  	LogUrl string
  	PoolID string
  	WorkServerNumber int
  	EngineID int64
  	// Parameters is sent to the remote API; Params (below) is the string
  	// form stored on the Cloudbrain row.
  	Parameters []models.Parameter
  	Params string
  	// PreVersionId is sent to the remote API and stored on the Cloudbrain
  	// row (presumably the ID of the version this one derives from — TODO
  	// confirm).
  	PreVersionId int64
  	CommitID string
  	BranchName string
  	FlavorName string
  }
  // VersionInfo is the JSON shape {"version":[{"id":<int>,"value":<string>}, ...]}.
  type VersionInfo struct {
  	Version []struct {
  		ID int `json:"id"`
  		Value string `json:"value"`
  	} `json:"version"`
  }
  // Flavor is the JSON shape {"flavor":[{"code":<string>,"value":<string>}, ...]}.
  type Flavor struct {
  	Info []struct {
  		Code string `json:"code"`
  		Value string `json:"value"`
  	} `json:"flavor"`
  }
  // Engine is the JSON shape {"engine":[{"id":<int>,"value":<string>}, ...]}.
  type Engine struct {
  	Info []struct {
  		ID int `json:"id"`
  		Value string `json:"value"`
  	} `json:"engine"`
  }
  // ResourcePool is the JSON shape
  // {"resource_pool":[{"id":<string>,"value":<string>}, ...]}.
  type ResourcePool struct {
  	Info []struct {
  		ID string `json:"id"`
  		Value string `json:"value"`
  	} `json:"resource_pool"`
  }
  123. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  124. var dataActualPath string
  125. if uuid != "" {
  126. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  127. } else {
  128. userPath := setting.UserBasePath + ctx.User.Name + "/"
  129. isExist, err := storage.ObsHasObject(userPath)
  130. if err != nil {
  131. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  132. return err
  133. }
  134. if !isExist {
  135. if err = storage.ObsCreateObject(userPath); err != nil {
  136. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  137. return err
  138. }
  139. }
  140. dataActualPath = setting.Bucket + "/" + userPath
  141. }
  142. if poolInfos == nil {
  143. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  144. }
  145. jobResult, err := CreateJob(models.CreateNotebookParams{
  146. JobName: jobName,
  147. Description: description,
  148. ProfileID: setting.ProfileID,
  149. Flavor: flavor,
  150. Pool: models.Pool{
  151. ID: poolInfos.PoolInfo[0].PoolId,
  152. Name: poolInfos.PoolInfo[0].PoolName,
  153. Type: poolInfos.PoolInfo[0].PoolType,
  154. },
  155. Spec: models.Spec{
  156. Storage: models.Storage{
  157. Type: storageTypeOBS,
  158. Location: models.Location{
  159. Path: dataActualPath,
  160. },
  161. },
  162. AutoStop: models.AutoStop{
  163. Enable: true,
  164. Duration: autoStopDuration,
  165. },
  166. },
  167. })
  168. if err != nil {
  169. log.Error("CreateJob failed: %v", err.Error())
  170. return err
  171. }
  172. err = models.CreateCloudbrain(&models.Cloudbrain{
  173. Status: string(models.JobWaiting),
  174. UserID: ctx.User.ID,
  175. RepoID: ctx.Repo.Repository.ID,
  176. JobID: jobResult.ID,
  177. JobName: jobName,
  178. JobType: string(models.JobTypeDebug),
  179. Type: models.TypeCloudBrainTwo,
  180. Uuid: uuid,
  181. })
  182. if err != nil {
  183. return err
  184. }
  185. return nil
  186. }
  187. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  188. jobResult, err := createTrainJob(models.CreateTrainJobParams{
  189. JobName: req.JobName,
  190. Description: req.Description,
  191. Config: models.Config{
  192. WorkServerNum: req.WorkServerNumber,
  193. AppUrl: req.CodeObsPath,
  194. BootFileUrl: req.BootFileUrl,
  195. DataUrl: req.DataUrl,
  196. EngineID: req.EngineID,
  197. TrainUrl: req.TrainUrl,
  198. LogUrl: req.LogUrl,
  199. PoolID: req.PoolID,
  200. CreateVersion: true,
  201. Flavor: models.Flavor{
  202. Code: req.FlavorCode,
  203. },
  204. Parameter: req.Parameters,
  205. },
  206. })
  207. if err != nil {
  208. log.Error("CreateJob failed: %v", err.Error())
  209. return err
  210. }
  211. attach, err := models.GetAttachmentByUUID(req.Uuid)
  212. if err != nil {
  213. log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  214. return err
  215. }
  216. err = models.CreateCloudbrain(&models.Cloudbrain{
  217. Status: TransTrainJobStatus(jobResult.Status),
  218. UserID: ctx.User.ID,
  219. RepoID: ctx.Repo.Repository.ID,
  220. JobID: strconv.FormatInt(jobResult.JobID, 10),
  221. JobName: req.JobName,
  222. JobType: string(models.JobTypeTrain),
  223. Type: models.TypeCloudBrainTwo,
  224. VersionID: jobResult.VersionID,
  225. VersionName: jobResult.VersionName,
  226. Uuid: req.Uuid,
  227. DatasetName: attach.Name,
  228. CommitID: req.CommitID,
  229. IsLatestVersion: req.IsLatestVersion,
  230. ComputeResource: ComputeResource,
  231. EngineID: req.EngineID,
  232. FatherVersionName: req.FatherVersionName,
  233. TrainUrl: req.TrainUrl,
  234. BranchName: req.BranchName,
  235. Parameters: req.Params,
  236. BootFile: req.BootFile,
  237. DataUrl: req.DataUrl,
  238. LogUrl: req.LogUrl,
  239. FlavorCode: req.FlavorCode,
  240. Description: req.Description,
  241. WorkServerNumber: req.WorkServerNumber,
  242. FlavorName: req.FlavorName,
  243. VersionCount: req.VersionCount,
  244. })
  245. if err != nil {
  246. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  247. return err
  248. }
  249. return nil
  250. }
  251. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string, fatherVersionName string) (err error) {
  252. jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{
  253. Description: req.Description,
  254. Config: models.TrainJobVersionConfig{
  255. WorkServerNum: req.WorkServerNumber,
  256. AppUrl: req.CodeObsPath,
  257. BootFileUrl: req.BootFileUrl,
  258. DataUrl: req.DataUrl,
  259. EngineID: req.EngineID,
  260. TrainUrl: req.TrainUrl,
  261. LogUrl: req.LogUrl,
  262. PoolID: req.PoolID,
  263. Flavor: models.Flavor{
  264. Code: req.FlavorCode,
  265. },
  266. Parameter: req.Parameters,
  267. PreVersionId: req.PreVersionId,
  268. },
  269. }, jobId)
  270. if err != nil {
  271. log.Error("CreateJob failed: %v", err.Error())
  272. return err
  273. }
  274. attach, err := models.GetAttachmentByUUID(req.Uuid)
  275. if err != nil {
  276. log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  277. return err
  278. }
  279. err = models.CreateCloudbrain(&models.Cloudbrain{
  280. Status: TransTrainJobStatus(jobResult.Status),
  281. UserID: ctx.User.ID,
  282. RepoID: ctx.Repo.Repository.ID,
  283. JobID: strconv.FormatInt(jobResult.JobID, 10),
  284. JobName: req.JobName,
  285. JobType: string(models.JobTypeTrain),
  286. Type: models.TypeCloudBrainTwo,
  287. VersionID: jobResult.VersionID,
  288. VersionName: jobResult.VersionName,
  289. Uuid: req.Uuid,
  290. DatasetName: attach.Name,
  291. CommitID: req.CommitID,
  292. FatherVersionName: fatherVersionName,
  293. ComputeResource: ComputeResource,
  294. EngineID: req.EngineID,
  295. TrainUrl: req.TrainUrl,
  296. BranchName: req.BranchName,
  297. Parameters: req.Params,
  298. BootFile: req.BootFile,
  299. DataUrl: req.DataUrl,
  300. LogUrl: req.LogUrl,
  301. PreVersionId: req.PreVersionId,
  302. FlavorCode: req.FlavorCode,
  303. Description: req.Description,
  304. WorkServerNumber: req.WorkServerNumber,
  305. FlavorName: req.FlavorName,
  306. })
  307. if err != nil {
  308. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  309. return err
  310. }
  311. repo := ctx.Repo.Repository
  312. page := ctx.QueryInt("page")
  313. if page <= 0 {
  314. page = 1
  315. }
  316. _, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  317. ListOptions: models.ListOptions{
  318. Page: page,
  319. PageSize: setting.UI.IssuePagingNum,
  320. },
  321. RepoID: repo.ID,
  322. Type: models.TypeCloudBrainTwo,
  323. JobType: string(models.JobTypeTrain),
  324. JobID: strconv.FormatInt(jobResult.JobID, 10),
  325. })
  326. if err != nil {
  327. ctx.ServerError("Cloudbrain", err)
  328. return err
  329. }
  330. //将训练任务的上一版本的isLatestVersion设置为"0"
  331. latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(strconv.FormatInt(jobResult.JobID, 10), IsLatestVersion)
  332. if err != nil {
  333. ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
  334. return err
  335. }
  336. err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), latestTask.VersionName, VersionListCount, NotLatestVersion)
  337. if err != nil {
  338. ctx.ServerError("UpdateJobVersionCount failed", err)
  339. return err
  340. }
  341. //将当前版本的isLatestVersion设置为"1"和任务数量更新
  342. err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), jobResult.VersionName, VersionListCount, IsLatestVersion)
  343. if err != nil {
  344. ctx.ServerError("UpdateJobVersionCount failed", err)
  345. return err
  346. }
  347. return err
  348. }
  // trainJobStatusNames maps a numeric train-job status code (the array index)
  // to its symbolic name.
  var trainJobStatusNames = [...]string{
  	0:  "UNKNOWN",
  	1:  "INIT",
  	2:  "IMAGE_CREATING",
  	3:  "IMAGE_FAILED",
  	4:  "SUBMIT_TRYING",
  	5:  "SUBMIT_FAILED",
  	6:  "DELETE_FAILED",
  	7:  "WAITING",
  	8:  "RUNNING",
  	9:  "KILLING",
  	10: "COMPLETED",
  	11: "FAILED",
  	12: "KILLED",
  	13: "CANCELED",
  	14: "LOST",
  	15: "SCALING",
  	16: "SUBMIT_MODEL_FAILED",
  	17: "DEPLOY_SERVICE_FAILED",
  	18: "CHECK_INIT",
  	19: "CHECK_RUNNING",
  	20: "CHECK_RUNNING_COMPLETED",
  	21: "CHECK_FAILED",
  }

  // TransTrainJobStatus translates a numeric train-job status code into its
  // symbolic name. Codes outside 0..21 are returned as their decimal string
  // form, so an unrecognised status is still visible to the caller.
  func TransTrainJobStatus(status int) string {
  	if status >= 0 && status < len(trainJobStatusNames) {
  		return trainJobStatusNames[status]
  	}
  	return strconv.Itoa(status)
  }