You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 27 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918
  1. package repo
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "io"
  6. "io/ioutil"
  7. "net/http"
  8. "os"
  9. "path"
  10. "strconv"
  11. "strings"
  12. "time"
  13. "code.gitea.io/gitea/models"
  14. "code.gitea.io/gitea/modules/auth"
  15. "code.gitea.io/gitea/modules/base"
  16. "code.gitea.io/gitea/modules/context"
  17. "code.gitea.io/gitea/modules/git"
  18. "code.gitea.io/gitea/modules/log"
  19. "code.gitea.io/gitea/modules/modelarts"
  20. "code.gitea.io/gitea/modules/obs"
  21. "code.gitea.io/gitea/modules/setting"
  22. "code.gitea.io/gitea/modules/storage"
  23. "github.com/unknwon/com"
  24. )
  25. const (
  26. tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index"
  27. tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new"
  28. tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show"
  29. tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index"
  30. tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
  31. tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
  32. tplModelArtsTrainJobShowModels base.TplName = "repo/modelarts/trainjob/models/index"
  33. )
  34. // MustEnableDataset check if repository enable internal cb
  35. func MustEnableModelArts(ctx *context.Context) {
  36. if !ctx.Repo.CanRead(models.UnitTypeCloudBrain) {
  37. ctx.NotFound("MustEnableCloudbrain", nil)
  38. return
  39. }
  40. }
  41. func NotebookIndex(ctx *context.Context) {
  42. MustEnableModelArts(ctx)
  43. repo := ctx.Repo.Repository
  44. page := ctx.QueryInt("page")
  45. if page <= 0 {
  46. page = 1
  47. }
  48. ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  49. ListOptions: models.ListOptions{
  50. Page: page,
  51. PageSize: setting.UI.IssuePagingNum,
  52. },
  53. RepoID: repo.ID,
  54. Type: models.TypeCloudBrainTwo,
  55. JobType: string(models.JobTypeDebug),
  56. })
  57. if err != nil {
  58. ctx.ServerError("Cloudbrain", err)
  59. return
  60. }
  61. for i, task := range ciTasks {
  62. if task.Status == string(models.JobRunning) {
  63. ciTasks[i].CanDebug = true
  64. } else {
  65. ciTasks[i].CanDebug = false
  66. }
  67. }
  68. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  69. pager.SetDefaultParams(ctx)
  70. ctx.Data["Page"] = pager
  71. ctx.Data["PageIsCloudBrain"] = true
  72. ctx.Data["Tasks"] = ciTasks
  73. ctx.HTML(200, tplModelArtsNotebookIndex)
  74. }
  75. func NotebookNew(ctx *context.Context) {
  76. ctx.Data["PageIsCloudBrain"] = true
  77. t := time.Now()
  78. var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  79. ctx.Data["job_name"] = jobName
  80. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  81. if err != nil {
  82. ctx.ServerError("GetAllUserAttachments failed:", err)
  83. return
  84. }
  85. ctx.Data["attachments"] = attachs
  86. ctx.Data["dataset_path"] = modelarts.DataSetMountPath
  87. ctx.Data["env"] = modelarts.NotebookEnv
  88. ctx.Data["notebook_type"] = modelarts.NotebookType
  89. if modelarts.FlavorInfos == nil {
  90. json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
  91. }
  92. ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
  93. ctx.HTML(200, tplModelArtsNotebookNew)
  94. }
  95. func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  96. ctx.Data["PageIsNotebook"] = true
  97. jobName := form.JobName
  98. uuid := form.Attachment
  99. description := form.Description
  100. flavor := form.Flavor
  101. err := modelarts.GenerateTask(ctx, jobName, uuid, description, flavor)
  102. if err != nil {
  103. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  104. return
  105. }
  106. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook")
  107. }
  108. func NotebookShow(ctx *context.Context) {
  109. ctx.Data["PageIsCloudBrain"] = true
  110. var jobID = ctx.Params(":jobid")
  111. task, err := models.GetCloudbrainByJobID(jobID)
  112. if err != nil {
  113. ctx.Data["error"] = err.Error()
  114. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  115. return
  116. }
  117. result, err := modelarts.GetJob(jobID)
  118. if err != nil {
  119. ctx.Data["error"] = err.Error()
  120. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  121. return
  122. }
  123. if result != nil {
  124. task.Status = result.Status
  125. err = models.UpdateJob(task)
  126. if err != nil {
  127. ctx.Data["error"] = err.Error()
  128. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  129. return
  130. }
  131. createTime, _ := com.StrTo(result.CreationTimestamp).Int64()
  132. result.CreateTime = time.Unix(int64(createTime/1000), 0).Format("2006-01-02 15:04:05")
  133. endTime, _ := com.StrTo(result.LatestUpdateTimestamp).Int64()
  134. result.LatestUpdateTime = time.Unix(int64(endTime/1000), 0).Format("2006-01-02 15:04:05")
  135. result.QueuingInfo.BeginTime = time.Unix(int64(result.QueuingInfo.BeginTimestamp/1000), 0).Format("2006-01-02 15:04:05")
  136. result.QueuingInfo.EndTime = time.Unix(int64(result.QueuingInfo.EndTimestamp/1000), 0).Format("2006-01-02 15:04:05")
  137. }
  138. ctx.Data["task"] = task
  139. ctx.Data["jobID"] = jobID
  140. ctx.Data["result"] = result
  141. ctx.HTML(200, tplModelArtsNotebookShow)
  142. }
  143. func NotebookDebug(ctx *context.Context) {
  144. var jobID = ctx.Params(":jobid")
  145. _, err := models.GetCloudbrainByJobID(jobID)
  146. if err != nil {
  147. ctx.ServerError("GetCloudbrainByJobID failed", err)
  148. return
  149. }
  150. result, err := modelarts.GetJob(jobID)
  151. if err != nil {
  152. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  153. return
  154. }
  155. res, err := modelarts.GetJobToken(jobID)
  156. if err != nil {
  157. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  158. return
  159. }
  160. urls := strings.Split(result.Spec.Annotations.Url, "/")
  161. urlPrefix := result.Spec.Annotations.TargetDomain
  162. for i, url := range urls {
  163. if i > 2 {
  164. urlPrefix += "/" + url
  165. }
  166. }
  167. debugUrl := urlPrefix + "?token=" + res.Token
  168. ctx.Redirect(debugUrl)
  169. }
  170. func NotebookStop(ctx *context.Context) {
  171. var jobID = ctx.Params(":jobid")
  172. log.Info(jobID)
  173. task, err := models.GetCloudbrainByJobID(jobID)
  174. if err != nil {
  175. ctx.ServerError("GetCloudbrainByJobID failed", err)
  176. return
  177. }
  178. if task.Status != string(models.JobRunning) {
  179. log.Error("the job(%s) is not running", task.JobName)
  180. ctx.ServerError("the job is not running", errors.New("the job is not running"))
  181. return
  182. }
  183. param := models.NotebookAction{
  184. Action: models.ActionStop,
  185. }
  186. res, err := modelarts.StopJob(jobID, param)
  187. if err != nil {
  188. log.Error("StopJob(%s) failed:%v", task.JobName, err.Error())
  189. ctx.ServerError("StopJob failed", err)
  190. return
  191. }
  192. task.Status = res.CurrentStatus
  193. err = models.UpdateJob(task)
  194. if err != nil {
  195. ctx.ServerError("UpdateJob failed", err)
  196. return
  197. }
  198. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook")
  199. }
  200. func NotebookDel(ctx *context.Context) {
  201. var jobID = ctx.Params(":jobid")
  202. task, err := models.GetCloudbrainByJobID(jobID)
  203. if err != nil {
  204. ctx.ServerError("GetCloudbrainByJobID failed", err)
  205. return
  206. }
  207. if task.Status != string(models.JobStopped) {
  208. log.Error("the job(%s) has not been stopped", task.JobName)
  209. ctx.ServerError("the job has not been stopped", errors.New("the job has not been stopped"))
  210. return
  211. }
  212. _, err = modelarts.DelNotebook(jobID)
  213. if err != nil {
  214. log.Error("DelJob(%s) failed:%v", task.JobName, err.Error())
  215. ctx.ServerError("DelJob failed", err)
  216. return
  217. }
  218. err = models.DeleteJob(task)
  219. if err != nil {
  220. ctx.ServerError("DeleteJob failed", err)
  221. return
  222. }
  223. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/notebook")
  224. }
  225. func TrainJobIndex(ctx *context.Context) {
  226. MustEnableModelArts(ctx)
  227. //can, err := canUserCreateTrainJob(ctx.User.ID)
  228. //if err != nil {
  229. // ctx.ServerError("canUserCreateTrainJob", err)
  230. // return
  231. //}
  232. //
  233. //ctx.Data["CanCreate"] = can
  234. repo := ctx.Repo.Repository
  235. page := ctx.QueryInt("page")
  236. if page <= 0 {
  237. page = 1
  238. }
  239. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  240. ListOptions: models.ListOptions{
  241. Page: page,
  242. PageSize: setting.UI.IssuePagingNum,
  243. },
  244. RepoID: repo.ID,
  245. Type: models.TypeCloudBrainTwo,
  246. JobType: string(models.JobTypeTrain),
  247. })
  248. if err != nil {
  249. ctx.ServerError("Cloudbrain", err)
  250. return
  251. }
  252. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  253. pager.SetDefaultParams(ctx)
  254. ctx.Data["Page"] = pager
  255. ctx.Data["PageIsCloudBrain"] = true
  256. ctx.Data["Tasks"] = tasks
  257. ctx.HTML(200, tplModelArtsTrainJobIndex)
  258. }
  259. func TrainJobNew(ctx *context.Context) {
  260. err := trainJobNewDataPrepare(ctx)
  261. if err != nil {
  262. ctx.ServerError("get new train-job info failed", err)
  263. return
  264. }
  265. ctx.HTML(200, tplModelArtsTrainJobNew)
  266. }
  267. func trainJobNewDataPrepare(ctx *context.Context) error {
  268. ctx.Data["PageIsCloudBrain"] = true
  269. //can, err := canUserCreateTrainJob(ctx.User.ID)
  270. //if err != nil {
  271. // ctx.ServerError("canUserCreateTrainJob", err)
  272. // return
  273. //}
  274. //
  275. //if !can {
  276. // log.Error("the user can not create train-job")
  277. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  278. // return
  279. //}
  280. t := time.Now()
  281. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  282. ctx.Data["job_name"] = jobName
  283. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  284. if err != nil {
  285. ctx.ServerError("GetAllUserAttachments failed:", err)
  286. return err
  287. }
  288. ctx.Data["attachments"] = attachs
  289. var resourcePools modelarts.ResourcePool
  290. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  291. ctx.ServerError("json.Unmarshal failed:", err)
  292. return err
  293. }
  294. ctx.Data["resource_pools"] = resourcePools.Info
  295. var engines modelarts.Engine
  296. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  297. ctx.ServerError("json.Unmarshal failed:", err)
  298. return err
  299. }
  300. ctx.Data["engines"] = engines.Info
  301. var versionInfos modelarts.VersionInfo
  302. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  303. ctx.ServerError("json.Unmarshal failed:", err)
  304. return err
  305. }
  306. ctx.Data["engine_versions"] = versionInfos.Version
  307. var flavorInfos modelarts.Flavor
  308. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  309. ctx.ServerError("json.Unmarshal failed:", err)
  310. return err
  311. }
  312. ctx.Data["flavor_infos"] = flavorInfos.Info
  313. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  314. ctx.Data["train_url"] = outputObsPath
  315. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  316. if err != nil {
  317. ctx.ServerError("getConfigList failed:", err)
  318. return err
  319. }
  320. ctx.Data["config_list"] = configList.ParaConfigs
  321. return nil
  322. }
  323. func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  324. ctx.Data["PageIsTrainJob"] = true
  325. jobName := form.JobName
  326. uuid := form.Attachment
  327. description := form.Description
  328. workServerNumber := form.WorkServerNumber
  329. engineID := form.EngineID
  330. bootFile := form.BootFile
  331. flavorCode := form.Flavor
  332. params := form.Params
  333. poolID := form.PoolID
  334. isSaveParam := form.IsSaveParam
  335. repo := ctx.Repo.Repository
  336. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  337. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  338. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  339. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath
  340. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  341. //can, err := canUserCreateTrainJob(ctx.User.ID)
  342. //if err != nil {
  343. // ctx.ServerError("canUserCreateTrainJob", err)
  344. // return
  345. //}
  346. //
  347. //if !can {
  348. // log.Error("the user can not create train-job")
  349. // ctx.RenderWithErr("the user can not create train-job", tplModelArtsTrainJobNew, &form)
  350. // return
  351. //}
  352. //param check
  353. if err := paramCheckCreateTrainJob(form); err != nil {
  354. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  355. trainJobNewDataPrepare(ctx)
  356. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  357. return
  358. }
  359. attach, err := models.GetAttachmentByUUID(uuid)
  360. if err != nil {
  361. log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
  362. return
  363. }
  364. //todo: del the codeLocalPath
  365. _, err = ioutil.ReadDir(codeLocalPath)
  366. if err == nil {
  367. os.RemoveAll(codeLocalPath)
  368. }
  369. if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil {
  370. log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
  371. trainJobNewDataPrepare(ctx)
  372. ctx.Data["bootFile"] = form.BootFile
  373. ctx.Data["uuid"] = form.Attachment
  374. ctx.Data["datasetName"] = attach.Name
  375. ctx.Data["params"] = form.Params
  376. trainJobNewDataPrepare(ctx)
  377. // ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form)
  378. ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobNew, &form)
  379. // ctx.RenderWithErr(err, tplModelArtsTrainJobNew, &form)
  380. return
  381. }
  382. //todo: upload code (send to file_server todo this work?)
  383. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
  384. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  385. trainJobNewDataPrepare(ctx)
  386. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
  387. return
  388. }
  389. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil {
  390. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  391. trainJobNewDataPrepare(ctx)
  392. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
  393. return
  394. }
  395. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  396. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  397. trainJobNewDataPrepare(ctx)
  398. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
  399. return
  400. }
  401. //todo: del local code?
  402. var parameters models.Parameters
  403. param := make([]models.Parameter, 0)
  404. param = append(param, models.Parameter{
  405. Label: modelarts.TrainUrl,
  406. Value: outputObsPath,
  407. }, models.Parameter{
  408. Label: modelarts.DataUrl,
  409. Value: dataPath,
  410. })
  411. if len(params) != 0 {
  412. err := json.Unmarshal([]byte(params), &parameters)
  413. if err != nil {
  414. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  415. trainJobNewDataPrepare(ctx)
  416. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
  417. return
  418. }
  419. for _, parameter := range parameters.Parameter {
  420. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  421. param = append(param, models.Parameter{
  422. Label: parameter.Label,
  423. Value: parameter.Value,
  424. })
  425. }
  426. }
  427. }
  428. //save param config
  429. if isSaveParam == "on" {
  430. if form.ParameterTemplateName == "" {
  431. log.Error("ParameterTemplateName is empty")
  432. trainJobNewDataPrepare(ctx)
  433. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
  434. return
  435. }
  436. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  437. ConfigName: form.ParameterTemplateName,
  438. Description: form.PrameterDescription,
  439. DataUrl: dataPath,
  440. AppUrl: codeObsPath,
  441. BootFileUrl: codeObsPath + bootFile,
  442. TrainUrl: outputObsPath,
  443. Flavor: models.Flavor{
  444. Code: flavorCode,
  445. },
  446. WorkServerNum: workServerNumber,
  447. EngineID: int64(engineID),
  448. LogUrl: logObsPath,
  449. PoolID: poolID,
  450. Parameter: param,
  451. })
  452. if err != nil {
  453. log.Error("Failed to CreateTrainJobConfig: %v", err)
  454. trainJobNewDataPrepare(ctx)
  455. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
  456. return
  457. }
  458. }
  459. req := &modelarts.GenerateTrainJobReq{
  460. JobName: jobName,
  461. DataUrl: dataPath,
  462. Description: description,
  463. CodeObsPath: codeObsPath,
  464. BootFile: codeObsPath + bootFile,
  465. TrainUrl: outputObsPath,
  466. FlavorCode: flavorCode,
  467. WorkServerNumber: workServerNumber,
  468. EngineID: int64(engineID),
  469. LogUrl: logObsPath,
  470. PoolID: poolID,
  471. Uuid: uuid,
  472. Parameters: param,
  473. }
  474. err = modelarts.GenerateTrainJob(ctx, req)
  475. if err != nil {
  476. log.Error("GenerateTrainJob failed:%v", err.Error())
  477. trainJobNewDataPrepare(ctx)
  478. ctx.Data["bootFile"] = form.BootFile
  479. ctx.Data["uuid"] = form.Attachment
  480. ctx.Data["datasetName"] = attach.Name
  481. ctx.Data["params"] = form.Params
  482. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  483. return
  484. }
  485. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  486. }
  // readDir returns every entry of the directory named by dirname, in the
  // order the file system reports them (the entries are not sorted).
  //
  // An empty directory yields a nil slice and a nil error. A directory that
  // cannot be opened or read yields a nil slice and the error.
  func readDir(dirname string) ([]os.FileInfo, error) {
  	f, err := os.Open(dirname)
  	if err != nil {
  		return nil, err
  	}
  	defer f.Close()

  	// Read in batches until the end of the directory: a single Readdir(100)
  	// call would silently drop everything past the first 100 entries.
  	var entries []os.FileInfo
  	for {
  		batch, err := f.Readdir(100)
  		entries = append(entries, batch...)
  		if err == io.EOF {
  			//todo: can not upload empty folder
  			return entries, nil
  		}
  		if err != nil {
  			return nil, err
  		}
  	}
  }
  506. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  507. files, err := readDir(codePath)
  508. if err != nil {
  509. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  510. return err
  511. }
  512. for _, file := range files {
  513. if file.IsDir() {
  514. input := &obs.PutObjectInput{}
  515. input.Bucket = setting.Bucket
  516. input.Key = parentDir + file.Name() + "/"
  517. _, err = storage.ObsCli.PutObject(input)
  518. if err != nil {
  519. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  520. return err
  521. }
  522. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  523. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  524. return err
  525. }
  526. } else {
  527. input := &obs.PutFileInput{}
  528. input.Bucket = setting.Bucket
  529. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  530. input.SourceFile = codePath + file.Name()
  531. _, err = storage.ObsCli.PutFile(input)
  532. if err != nil {
  533. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  534. return err
  535. }
  536. }
  537. }
  538. return nil
  539. }
  540. func obsMkdir(dir string) error {
  541. input := &obs.PutObjectInput{}
  542. input.Bucket = setting.Bucket
  543. input.Key = dir
  544. _, err := storage.ObsCli.PutObject(input)
  545. if err != nil {
  546. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  547. return err
  548. }
  549. return nil
  550. }
  551. func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
  552. if !strings.HasSuffix(form.BootFile, ".py") {
  553. log.Error("the boot file(%s) must be a python file", form.BootFile)
  554. return errors.New("启动文件必须是python文件")
  555. }
  556. if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 {
  557. log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber)
  558. return errors.New("计算节点数必须在1-25之间")
  559. }
  560. return nil
  561. }
  562. func TrainJobShow(ctx *context.Context) {
  563. ctx.Data["PageIsCloudBrain"] = true
  564. var jobID = ctx.Params(":jobid")
  565. task, err := models.GetCloudbrainByJobID(jobID)
  566. if err != nil {
  567. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  568. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  569. return
  570. }
  571. // attach, err := models.GetAttachmentByUUID(task.Uuid)
  572. // if err != nil {
  573. // log.Error("GetAttachmentByUUID(%s) failed:%v", jobID, err.Error())
  574. // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  575. // return
  576. // }
  577. result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  578. if err != nil {
  579. log.Error("GetJob(%s) failed:%v", jobID, err.Error())
  580. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  581. return
  582. }
  583. if result != nil {
  584. result.CreateTime = time.Unix(int64(result.LongCreateTime/1000), 0).Format("2006-01-02 15:04:05")
  585. if result.Duration != 0 {
  586. result.TrainJobDuration = addZero(result.Duration/3600000) + ":" + addZero(result.Duration%3600000/60000) + ":" + addZero(result.Duration%60000/1000)
  587. } else {
  588. result.TrainJobDuration = "00:00:00"
  589. }
  590. result.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  591. err = models.SetTrainJobStatusByJobID(jobID, result.Status, result.Duration, string(result.TrainJobDuration))
  592. if err != nil {
  593. ctx.ServerError("UpdateJob failed", err)
  594. return
  595. }
  596. result.DatasetName = task.DatasetName
  597. }
  598. resultLogFile, resultLog, err := trainJobGetLog(jobID)
  599. if err != nil {
  600. log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error())
  601. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  602. return
  603. }
  604. ctx.Data["log_file_name"] = resultLogFile.LogFileList[0]
  605. ctx.Data["log"] = resultLog
  606. ctx.Data["task"] = task
  607. ctx.Data["jobID"] = jobID
  608. ctx.Data["result"] = result
  609. ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  610. }
  // addZero formats t in base 10 and prefixes a "0" when t is less than 10,
  // so single-digit values come out two characters wide.
  func addZero(t int64) (m string) {
  	m = strconv.FormatInt(t, 10)
  	if t < 10 {
  		m = "0" + m
  	}
  	return m
  }
  619. func TrainJobGetLog(ctx *context.Context) {
  620. ctx.Data["PageIsTrainJob"] = true
  621. var jobID = ctx.Params(":jobid")
  622. var logFileName = ctx.Query("file_name")
  623. var baseLine = ctx.Query("base_line")
  624. var order = ctx.Query("order")
  625. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  626. log.Error("order(%s) check failed", order)
  627. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  628. return
  629. }
  630. task, err := models.GetCloudbrainByJobID(jobID)
  631. if err != nil {
  632. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  633. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  634. return
  635. }
  636. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  637. if err != nil {
  638. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  639. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  640. return
  641. }
  642. ctx.Data["log"] = result
  643. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  644. }
  645. func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
  646. task, err := models.GetCloudbrainByJobID(jobID)
  647. if err != nil {
  648. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  649. return nil, nil, err
  650. }
  651. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
  652. if err != nil {
  653. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  654. return nil, nil, err
  655. }
  656. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines)
  657. if err != nil {
  658. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  659. return nil, nil, err
  660. }
  661. return resultLogFile, result, err
  662. }
  663. func TrainJobDel(ctx *context.Context) {
  664. var jobID = ctx.Params(":jobid")
  665. task, err := models.GetCloudbrainByJobID(jobID)
  666. if err != nil {
  667. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  668. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  669. return
  670. }
  671. _, err = modelarts.DelTrainJob(jobID)
  672. if err != nil {
  673. log.Error("DelTrainJob(%s) failed:%v", task.JobName, err.Error())
  674. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  675. return
  676. }
  677. err = models.DeleteJob(task)
  678. if err != nil {
  679. ctx.ServerError("DeleteJob failed", err)
  680. return
  681. }
  682. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  683. }
  684. func TrainJobStop(ctx *context.Context) {
  685. var jobID = ctx.Params(":jobid")
  686. task, err := models.GetCloudbrainByJobID(jobID)
  687. if err != nil {
  688. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  689. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  690. return
  691. }
  692. _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  693. if err != nil {
  694. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  695. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  696. return
  697. }
  698. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  699. }
  700. func canUserCreateTrainJob(uid int64) (bool, error) {
  701. org, err := models.GetOrgByName(setting.AllowedOrg)
  702. if err != nil {
  703. log.Error("get allowed org failed: ", setting.AllowedOrg)
  704. return false, err
  705. }
  706. return org.IsOrgMember(uid)
  707. }
  708. func TrainJobGetConfigList(ctx *context.Context) {
  709. ctx.Data["PageIsTrainJob"] = true
  710. var jobID = ctx.Params(":jobid")
  711. var logFileName = ctx.Query("file_name")
  712. var baseLine = ctx.Query("base_line")
  713. var order = ctx.Query("order")
  714. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  715. log.Error("order(%s) check failed", order)
  716. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  717. return
  718. }
  719. task, err := models.GetCloudbrainByJobID(jobID)
  720. if err != nil {
  721. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  722. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  723. return
  724. }
  725. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  726. if err != nil {
  727. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  728. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  729. return
  730. }
  731. ctx.Data["log"] = result
  732. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  733. }
  734. func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
  735. var result models.GetConfigListResult
  736. list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
  737. if err != nil {
  738. log.Error("GetConfigList failed:", err)
  739. return &result, err
  740. }
  741. for _, config := range list.ParaConfigs {
  742. paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
  743. if err != nil {
  744. log.Error("GetParaConfig failed:", err)
  745. return &result, err
  746. }
  747. config.Result = paraConfig
  748. }
  749. return list, nil
  750. }
  751. func TrainJobShowModels(ctx *context.Context) {
  752. ctx.Data["PageIsCloudBrain"] = true
  753. jobID := ctx.Params(":jobid")
  754. parentDir := ctx.Query("parentDir")
  755. dirArray := strings.Split(parentDir, "/")
  756. task, err := models.GetCloudbrainByJobID(jobID)
  757. if err != nil {
  758. log.Error("no such job!", ctx.Data["msgID"])
  759. ctx.ServerError("no such job:", err)
  760. return
  761. }
  762. models, err := storage.GetObsListObject(task.JobName, parentDir)
  763. if err != nil {
  764. log.Info("get TrainJobListModel failed:", err)
  765. ctx.ServerError("GetObsListObject:", err)
  766. return
  767. }
  768. ctx.Data["Path"] = dirArray
  769. ctx.Data["Dirs"] = models
  770. ctx.Data["task"] = task
  771. ctx.Data["JobID"] = jobID
  772. ctx.HTML(200, tplModelArtsTrainJobShowModels)
  773. }
  774. func TrainJobDownloadModel(ctx *context.Context) {
  775. parentDir := ctx.Query("parentDir")
  776. fileName := ctx.Query("fileName")
  777. jobName := ctx.Query("jobName")
  778. url, err := storage.GetObsCreateSignedUrl(jobName, parentDir, fileName)
  779. if err != nil {
  780. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  781. ctx.ServerError("GetObsCreateSignedUrl", err)
  782. return
  783. }
  784. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  785. }