You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

modelarts.go 49 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago

  1. package repo
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "io"
  6. "net/http"
  7. "os"
  8. "path"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "code.gitea.io/gitea/models"
  13. "code.gitea.io/gitea/modules/auth"
  14. "code.gitea.io/gitea/modules/base"
  15. "code.gitea.io/gitea/modules/cloudbrain"
  16. "code.gitea.io/gitea/modules/context"
  17. "code.gitea.io/gitea/modules/git"
  18. "code.gitea.io/gitea/modules/log"
  19. "code.gitea.io/gitea/modules/modelarts"
  20. "code.gitea.io/gitea/modules/obs"
  21. "code.gitea.io/gitea/modules/setting"
  22. "code.gitea.io/gitea/modules/storage"
  23. "github.com/unknwon/com"
  24. )
  25. const (
  26. tplDebugJobIndex base.TplName = "repo/debugjob/index"
  27. tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index"
  28. tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new"
  29. tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show"
  30. tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index"
  31. tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
  32. tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
  33. tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
  34. )
  35. func DebugJobIndex(ctx *context.Context) {
  36. debugListType := ctx.Query("debugListType")
  37. ctx.Data["ListType"] = debugListType
  38. MustEnableCloudbrain(ctx)
  39. repo := ctx.Repo.Repository
  40. page := ctx.QueryInt("page")
  41. if page <= 0 {
  42. page = 1
  43. }
  44. debugType := modelarts.DebugType
  45. if debugListType == models.GPUResource {
  46. debugType = models.TypeCloudBrainOne
  47. } else if debugListType == models.NPUResource {
  48. debugType = models.TypeCloudBrainTwo
  49. }
  50. ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  51. ListOptions: models.ListOptions{
  52. Page: page,
  53. PageSize: setting.UI.IssuePagingNum,
  54. },
  55. RepoID: repo.ID,
  56. Type: debugType,
  57. JobTypeNot: true,
  58. JobType: string(models.JobTypeTrain),
  59. })
  60. if err != nil {
  61. ctx.ServerError("Get debugjob faild:", err)
  62. return
  63. }
  64. for i, task := range ciTasks {
  65. ciTasks[i].CanDebug = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  66. ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  67. ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource
  68. }
  69. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  70. pager.AddParam(ctx, "debugListType", "ListType")
  71. ctx.Data["Page"] = pager
  72. ctx.Data["PageIsCloudBrain"] = true
  73. ctx.Data["Tasks"] = ciTasks
  74. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  75. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  76. ctx.HTML(200, tplDebugJobIndex)
  77. }
  78. // MustEnableDataset check if repository enable internal cb
  79. func MustEnableModelArts(ctx *context.Context) {
  80. if !ctx.Repo.CanRead(models.UnitTypeCloudBrain) {
  81. ctx.NotFound("MustEnableCloudbrain", nil)
  82. return
  83. }
  84. }
  85. func NotebookNew(ctx *context.Context) {
  86. ctx.Data["PageIsCloudBrain"] = true
  87. t := time.Now()
  88. var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  89. ctx.Data["job_name"] = jobName
  90. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  91. if err != nil {
  92. ctx.ServerError("GetAllUserAttachments failed:", err)
  93. return
  94. }
  95. ctx.Data["attachments"] = attachs
  96. ctx.Data["dataset_path"] = modelarts.DataSetMountPath
  97. ctx.Data["env"] = modelarts.NotebookEnv
  98. ctx.Data["notebook_type"] = modelarts.NotebookType
  99. if modelarts.FlavorInfos == nil {
  100. json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
  101. }
  102. ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
  103. ctx.HTML(200, tplModelArtsNotebookNew)
  104. }
  105. func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  106. ctx.Data["PageIsNotebook"] = true
  107. jobName := form.JobName
  108. uuid := form.Attachment
  109. description := form.Description
  110. flavor := form.Flavor
  111. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  112. if err != nil {
  113. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  114. cloudBrainNewDataPrepare(ctx)
  115. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  116. return
  117. } else {
  118. if count >= 1 {
  119. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  120. cloudBrainNewDataPrepare(ctx)
  121. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  122. return
  123. }
  124. }
  125. err = modelarts.GenerateTask(ctx, jobName, uuid, description, flavor)
  126. if err != nil {
  127. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  128. return
  129. }
  130. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  131. }
  132. func NotebookShow(ctx *context.Context) {
  133. ctx.Data["PageIsCloudBrain"] = true
  134. var jobID = ctx.Params(":jobid")
  135. task, err := models.GetCloudbrainByJobID(jobID)
  136. if err != nil {
  137. ctx.Data["error"] = err.Error()
  138. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  139. return
  140. }
  141. result, err := modelarts.GetJob(jobID)
  142. if err != nil {
  143. ctx.Data["error"] = err.Error()
  144. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  145. return
  146. }
  147. if result != nil {
  148. task.Status = result.Status
  149. err = models.UpdateJob(task)
  150. if err != nil {
  151. ctx.Data["error"] = err.Error()
  152. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  153. return
  154. }
  155. createTime, _ := com.StrTo(result.CreationTimestamp).Int64()
  156. result.CreateTime = time.Unix(int64(createTime/1000), 0).Format("2006-01-02 15:04:05")
  157. endTime, _ := com.StrTo(result.LatestUpdateTimestamp).Int64()
  158. result.LatestUpdateTime = time.Unix(int64(endTime/1000), 0).Format("2006-01-02 15:04:05")
  159. result.QueuingInfo.BeginTime = time.Unix(int64(result.QueuingInfo.BeginTimestamp/1000), 0).Format("2006-01-02 15:04:05")
  160. result.QueuingInfo.EndTime = time.Unix(int64(result.QueuingInfo.EndTimestamp/1000), 0).Format("2006-01-02 15:04:05")
  161. }
  162. ctx.Data["task"] = task
  163. ctx.Data["jobID"] = jobID
  164. ctx.Data["result"] = result
  165. ctx.HTML(200, tplModelArtsNotebookShow)
  166. }
  167. func NotebookDebug(ctx *context.Context) {
  168. var jobID = ctx.Params(":jobid")
  169. _, err := models.GetCloudbrainByJobID(jobID)
  170. if err != nil {
  171. ctx.ServerError("GetCloudbrainByJobID failed", err)
  172. return
  173. }
  174. result, err := modelarts.GetJob(jobID)
  175. if err != nil {
  176. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  177. return
  178. }
  179. res, err := modelarts.GetJobToken(jobID)
  180. if err != nil {
  181. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  182. return
  183. }
  184. urls := strings.Split(result.Spec.Annotations.Url, "/")
  185. urlPrefix := result.Spec.Annotations.TargetDomain
  186. for i, url := range urls {
  187. if i > 2 {
  188. urlPrefix += "/" + url
  189. }
  190. }
  191. debugUrl := urlPrefix + "?token=" + res.Token
  192. ctx.Redirect(debugUrl)
  193. }
  194. func NotebookManage(ctx *context.Context) {
  195. var jobID = ctx.Params(":jobid")
  196. var action = ctx.Params(":action")
  197. var resultCode = "0"
  198. var errorMsg = ""
  199. var status = ""
  200. for {
  201. task, err := models.GetCloudbrainByJobID(jobID)
  202. if err != nil {
  203. log.Error("GetCloudbrainByJobID failed:%v", err, ctx.Data["MsgID"])
  204. resultCode = "-1"
  205. errorMsg = "system error"
  206. break
  207. }
  208. if action == models.ActionStop {
  209. if task.Status != string(models.ModelArtsRunning) {
  210. log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
  211. resultCode = "-1"
  212. errorMsg = "the job is not running"
  213. break
  214. }
  215. if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin() && !ctx.IsUserRepoOwner()) {
  216. log.Error("the user has no right ro stop the job", task.JobName, ctx.Data["MsgID"])
  217. resultCode = "-1"
  218. errorMsg = "you have no right to stop the job"
  219. break
  220. }
  221. } else if action == models.ActionRestart {
  222. if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
  223. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  224. resultCode = "-1"
  225. errorMsg = "the job is not stopped"
  226. break
  227. }
  228. if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin()) {
  229. log.Error("the user has no right ro restart the job", task.JobName, ctx.Data["MsgID"])
  230. resultCode = "-1"
  231. errorMsg = "you have no right to restart the job"
  232. break
  233. }
  234. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  235. if err != nil {
  236. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  237. resultCode = "-1"
  238. errorMsg = "system error"
  239. break
  240. } else {
  241. if count >= 1 {
  242. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  243. resultCode = "-1"
  244. errorMsg = "you have already a running or waiting task, can not create more"
  245. break
  246. }
  247. }
  248. action = models.ActionStart
  249. } else {
  250. log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"])
  251. resultCode = "-1"
  252. errorMsg = "非法操作"
  253. break
  254. }
  255. param := models.NotebookAction{
  256. Action: action,
  257. }
  258. res, err := modelarts.ManageNotebook(jobID, param)
  259. if err != nil {
  260. log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  261. resultCode = "-1"
  262. errorMsg = "启动失败"
  263. break
  264. }
  265. task.Status = res.CurrentStatus
  266. err = models.UpdateJob(task)
  267. if err != nil {
  268. log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  269. resultCode = "-1"
  270. errorMsg = "system error"
  271. break
  272. }
  273. status = task.Status
  274. break
  275. }
  276. ctx.JSON(200, map[string]string{
  277. "result_code": resultCode,
  278. "error_msg": errorMsg,
  279. "status": status,
  280. "job_id": jobID,
  281. })
  282. }
  283. func NotebookDel(ctx *context.Context) {
  284. var jobID = ctx.Params(":jobid")
  285. task, err := models.GetCloudbrainByJobID(jobID)
  286. if err != nil {
  287. ctx.ServerError("GetCloudbrainByJobID failed", err)
  288. return
  289. }
  290. if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) {
  291. log.Error("the job(%s) has not been stopped", task.JobName)
  292. ctx.ServerError("the job has not been stopped", errors.New("the job has not been stopped"))
  293. return
  294. }
  295. _, err = modelarts.DelNotebook(jobID)
  296. if err != nil {
  297. log.Error("DelJob(%s) failed:%v", task.JobName, err.Error())
  298. ctx.ServerError("DelJob failed", err)
  299. return
  300. }
  301. err = models.DeleteJob(task)
  302. if err != nil {
  303. ctx.ServerError("DeleteJob failed", err)
  304. return
  305. }
  306. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  307. }
  308. func TrainJobIndex(ctx *context.Context) {
  309. MustEnableModelArts(ctx)
  310. repo := ctx.Repo.Repository
  311. page := ctx.QueryInt("page")
  312. if page <= 0 {
  313. page = 1
  314. }
  315. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  316. ListOptions: models.ListOptions{
  317. Page: page,
  318. PageSize: setting.UI.IssuePagingNum,
  319. },
  320. RepoID: repo.ID,
  321. Type: models.TypeCloudBrainTwo,
  322. JobTypeNot: false,
  323. JobType: string(models.JobTypeTrain),
  324. IsLatestVersion: modelarts.IsLatestVersion,
  325. })
  326. if err != nil {
  327. ctx.ServerError("Cloudbrain", err)
  328. return
  329. }
  330. for i, task := range tasks {
  331. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  332. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  333. tasks[i].ComputeResource = models.NPUResource
  334. }
  335. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  336. pager.SetDefaultParams(ctx)
  337. ctx.Data["Page"] = pager
  338. ctx.Data["PageIsCloudBrain"] = true
  339. ctx.Data["Tasks"] = tasks
  340. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  341. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  342. ctx.HTML(200, tplModelArtsTrainJobIndex)
  343. }
  344. func TrainJobNew(ctx *context.Context) {
  345. err := trainJobNewDataPrepare(ctx)
  346. if err != nil {
  347. ctx.ServerError("get new train-job info failed", err)
  348. return
  349. }
  350. ctx.HTML(200, tplModelArtsTrainJobNew)
  351. }
  352. func trainJobNewDataPrepare(ctx *context.Context) error {
  353. ctx.Data["PageIsCloudBrain"] = true
  354. //can, err := canUserCreateTrainJob(ctx.User.ID)
  355. //if err != nil {
  356. // ctx.ServerError("canUserCreateTrainJob", err)
  357. // return
  358. //}
  359. //
  360. //if !can {
  361. // log.Error("the user can not create train-job")
  362. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  363. // return
  364. //}
  365. t := time.Now()
  366. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  367. ctx.Data["job_name"] = jobName
  368. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  369. if err != nil {
  370. ctx.ServerError("GetAllUserAttachments failed:", err)
  371. return err
  372. }
  373. ctx.Data["attachments"] = attachs
  374. var resourcePools modelarts.ResourcePool
  375. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  376. ctx.ServerError("json.Unmarshal failed:", err)
  377. return err
  378. }
  379. ctx.Data["resource_pools"] = resourcePools.Info
  380. var engines modelarts.Engine
  381. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  382. ctx.ServerError("json.Unmarshal failed:", err)
  383. return err
  384. }
  385. ctx.Data["engines"] = engines.Info
  386. var versionInfos modelarts.VersionInfo
  387. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  388. ctx.ServerError("json.Unmarshal failed:", err)
  389. return err
  390. }
  391. ctx.Data["engine_versions"] = versionInfos.Version
  392. var flavorInfos modelarts.Flavor
  393. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  394. ctx.ServerError("json.Unmarshal failed:", err)
  395. return err
  396. }
  397. ctx.Data["flavor_infos"] = flavorInfos.Info
  398. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  399. ctx.Data["train_url"] = outputObsPath
  400. ctx.Data["params"] = ""
  401. ctx.Data["branchName"] = ctx.Repo.BranchName
  402. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  403. if err != nil {
  404. ctx.ServerError("getConfigList failed:", err)
  405. return err
  406. }
  407. ctx.Data["config_list"] = configList.ParaConfigs
  408. return nil
  409. }
  410. func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  411. ctx.Data["PageIsCloudBrain"] = true
  412. //can, err := canUserCreateTrainJob(ctx.User.ID)
  413. //if err != nil {
  414. // ctx.ServerError("canUserCreateTrainJob", err)
  415. // return
  416. //}
  417. //
  418. //if !can {
  419. // log.Error("the user can not create train-job")
  420. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  421. // return
  422. //}
  423. t := time.Now()
  424. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  425. ctx.Data["job_name"] = jobName
  426. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  427. if err != nil {
  428. ctx.ServerError("GetAllUserAttachments failed:", err)
  429. return err
  430. }
  431. ctx.Data["attachments"] = attachs
  432. var resourcePools modelarts.ResourcePool
  433. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  434. ctx.ServerError("json.Unmarshal failed:", err)
  435. return err
  436. }
  437. ctx.Data["resource_pools"] = resourcePools.Info
  438. var engines modelarts.Engine
  439. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  440. ctx.ServerError("json.Unmarshal failed:", err)
  441. return err
  442. }
  443. ctx.Data["engines"] = engines.Info
  444. var versionInfos modelarts.VersionInfo
  445. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  446. ctx.ServerError("json.Unmarshal failed:", err)
  447. return err
  448. }
  449. ctx.Data["engine_versions"] = versionInfos.Version
  450. var flavorInfos modelarts.Flavor
  451. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  452. ctx.ServerError("json.Unmarshal failed:", err)
  453. return err
  454. }
  455. ctx.Data["flavor_infos"] = flavorInfos.Info
  456. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  457. ctx.Data["train_url"] = outputObsPath
  458. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  459. if err != nil {
  460. ctx.ServerError("getConfigList failed:", err)
  461. return err
  462. }
  463. var Parameters modelarts.Parameters
  464. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  465. ctx.ServerError("json.Unmarshal failed:", err)
  466. return err
  467. }
  468. ctx.Data["params"] = Parameters.Parameter
  469. ctx.Data["config_list"] = configList.ParaConfigs
  470. ctx.Data["bootFile"] = form.BootFile
  471. ctx.Data["uuid"] = form.Attachment
  472. ctx.Data["branch_name"] = form.BranchName
  473. return nil
  474. }
  475. func TrainJobNewVersion(ctx *context.Context) {
  476. err := trainJobNewVersionDataPrepare(ctx)
  477. if err != nil {
  478. ctx.ServerError("get new train-job info failed", err)
  479. return
  480. }
  481. ctx.HTML(200, tplModelArtsTrainJobVersionNew)
  482. }
  483. func trainJobNewVersionDataPrepare(ctx *context.Context) error {
  484. ctx.Data["PageIsCloudBrain"] = true
  485. var jobID = ctx.Params(":jobid")
  486. var versionName = ctx.Query("version_name")
  487. // canNewJob, err := canUserCreateTrainJobVersion(ctx, jobID, versionName)
  488. // if err != nil {
  489. // ctx.ServerError("canNewJob can info failed", err)
  490. // return err
  491. // }
  492. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  493. if err != nil {
  494. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  495. return err
  496. }
  497. t := time.Now()
  498. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  499. ctx.Data["job_name"] = task.JobName
  500. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  501. if err != nil {
  502. ctx.ServerError("GetAllUserAttachments failed:", err)
  503. return err
  504. }
  505. ctx.Data["attachments"] = attachs
  506. var resourcePools modelarts.ResourcePool
  507. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  508. ctx.ServerError("json.Unmarshal failed:", err)
  509. return err
  510. }
  511. ctx.Data["resource_pools"] = resourcePools.Info
  512. var engines modelarts.Engine
  513. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  514. ctx.ServerError("json.Unmarshal failed:", err)
  515. return err
  516. }
  517. ctx.Data["engines"] = engines.Info
  518. var versionInfos modelarts.VersionInfo
  519. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  520. ctx.ServerError("json.Unmarshal failed:", err)
  521. return err
  522. }
  523. ctx.Data["engine_versions"] = versionInfos.Version
  524. var flavorInfos modelarts.Flavor
  525. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  526. ctx.ServerError("json.Unmarshal failed:", err)
  527. return err
  528. }
  529. ctx.Data["flavor_infos"] = flavorInfos.Info
  530. var Parameters modelarts.Parameters
  531. if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil {
  532. ctx.ServerError("json.Unmarshal failed:", err)
  533. return err
  534. }
  535. ctx.Data["params"] = Parameters.Parameter
  536. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  537. ctx.Data["train_url"] = outputObsPath
  538. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  539. if err != nil {
  540. ctx.ServerError("GetBranches error:", err)
  541. return err
  542. }
  543. ctx.Data["branches"] = branches
  544. ctx.Data["branch_name"] = task.BranchName
  545. ctx.Data["description"] = task.Description
  546. ctx.Data["boot_file"] = task.BootFile
  547. ctx.Data["dataset_name"] = task.DatasetName
  548. ctx.Data["work_server_number"] = task.WorkServerNumber
  549. ctx.Data["flavor_name"] = task.FlavorName
  550. ctx.Data["engine_name"] = task.EngineName
  551. ctx.Data["uuid"] = task.Uuid
  552. ctx.Data["flavor_code"] = task.FlavorCode
  553. ctx.Data["engine_id"] = task.EngineID
  554. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  555. if err != nil {
  556. ctx.ServerError("getConfigList failed:", err)
  557. return err
  558. }
  559. ctx.Data["config_list"] = configList.ParaConfigs
  560. return nil
  561. }
  562. func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  563. ctx.Data["PageIsCloudBrain"] = true
  564. var jobID = ctx.Params(":jobid")
  565. // var versionName = ctx.Params(":version-name")
  566. var versionName = ctx.Query("version_name")
  567. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  568. if err != nil {
  569. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  570. return err
  571. }
  572. t := time.Now()
  573. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  574. ctx.Data["job_name"] = task.JobName
  575. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  576. if err != nil {
  577. ctx.ServerError("GetAllUserAttachments failed:", err)
  578. return err
  579. }
  580. ctx.Data["attachments"] = attachs
  581. var resourcePools modelarts.ResourcePool
  582. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  583. ctx.ServerError("json.Unmarshal failed:", err)
  584. return err
  585. }
  586. ctx.Data["resource_pools"] = resourcePools.Info
  587. var engines modelarts.Engine
  588. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  589. ctx.ServerError("json.Unmarshal failed:", err)
  590. return err
  591. }
  592. ctx.Data["engines"] = engines.Info
  593. var versionInfos modelarts.VersionInfo
  594. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  595. ctx.ServerError("json.Unmarshal failed:", err)
  596. return err
  597. }
  598. ctx.Data["engine_versions"] = versionInfos.Version
  599. var flavorInfos modelarts.Flavor
  600. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  601. ctx.ServerError("json.Unmarshal failed:", err)
  602. return err
  603. }
  604. ctx.Data["flavor_infos"] = flavorInfos.Info
  605. var Parameters modelarts.Parameters
  606. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  607. ctx.ServerError("json.Unmarshal failed:", err)
  608. return err
  609. }
  610. ctx.Data["params"] = Parameters.Parameter
  611. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  612. ctx.Data["train_url"] = outputObsPath
  613. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  614. if err != nil {
  615. ctx.ServerError("GetBranches error:", err)
  616. return err
  617. }
  618. ctx.Data["branches"] = branches
  619. ctx.Data["description"] = form.Description
  620. ctx.Data["dataset_name"] = task.DatasetName
  621. ctx.Data["work_server_number"] = form.WorkServerNumber
  622. ctx.Data["flavor_name"] = form.FlavorName
  623. ctx.Data["engine_name"] = form.EngineName
  624. ctx.Data["flavor_code"] = task.FlavorCode
  625. ctx.Data["engine_id"] = task.EngineID
  626. ctx.Data["version_name"] = form.VersionName
  627. ctx.Data["bootFile"] = form.BootFile
  628. ctx.Data["uuid"] = form.Attachment
  629. ctx.Data["branch_name"] = form.BranchName
  630. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  631. if err != nil {
  632. ctx.ServerError("getConfigList failed:", err)
  633. return err
  634. }
  635. ctx.Data["config_list"] = configList.ParaConfigs
  636. return nil
  637. }
  638. func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  639. ctx.Data["PageIsTrainJob"] = true
  640. VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(modelarts.TotalVersionCount)
  641. jobName := form.JobName
  642. uuid := form.Attachment
  643. description := form.Description
  644. workServerNumber := form.WorkServerNumber
  645. engineID := form.EngineID
  646. bootFile := form.BootFile
  647. flavorCode := form.Flavor
  648. params := form.Params
  649. poolID := form.PoolID
  650. isSaveParam := form.IsSaveParam
  651. repo := ctx.Repo.Repository
  652. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  653. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  654. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  655. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  656. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  657. branch_name := form.BranchName
  658. isLatestVersion := modelarts.IsLatestVersion
  659. FlavorName := form.FlavorName
  660. VersionCount := modelarts.VersionCount
  661. EngineName := form.EngineName
  662. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  663. if err != nil {
  664. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  665. trainJobErrorNewDataPrepare(ctx, form)
  666. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  667. return
  668. } else {
  669. if count >= 1 {
  670. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  671. trainJobErrorNewDataPrepare(ctx, form)
  672. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
  673. return
  674. }
  675. }
  676. if err := paramCheckCreateTrainJob(form); err != nil {
  677. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  678. trainJobErrorNewDataPrepare(ctx, form)
  679. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  680. return
  681. }
  682. // attach, err := models.GetAttachmentByUUID(uuid)
  683. // if err != nil {
  684. // log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
  685. // return
  686. // }
  687. //todo: del the codeLocalPath
  688. // _, err := ioutil.ReadDir(codeLocalPath)
  689. // if err == nil {
  690. // os.RemoveAll(codeLocalPath)
  691. // }
  692. os.RemoveAll(codeLocalPath)
  693. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  694. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  695. if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
  696. Branch: branch_name,
  697. }); err != nil {
  698. log.Error("创建任务失败,服务器超时!: %s (%v)", repo.FullName(), err)
  699. trainJobErrorNewDataPrepare(ctx, form)
  700. ctx.RenderWithErr("创建任务失败,服务器超时!", tplModelArtsTrainJobNew, &form)
  701. return
  702. }
  703. //todo: upload code (send to file_server todo this work?)
  704. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  705. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  706. trainJobErrorNewDataPrepare(ctx, form)
  707. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
  708. return
  709. }
  710. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  711. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  712. trainJobErrorNewDataPrepare(ctx, form)
  713. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
  714. return
  715. }
  716. // parentDir := VersionOutputPath + "/"
  717. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  718. // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  719. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  720. trainJobErrorNewDataPrepare(ctx, form)
  721. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
  722. return
  723. }
  724. //todo: del local code?
  725. var parameters models.Parameters
  726. param := make([]models.Parameter, 0)
  727. param = append(param, models.Parameter{
  728. Label: modelarts.TrainUrl,
  729. Value: outputObsPath,
  730. }, models.Parameter{
  731. Label: modelarts.DataUrl,
  732. Value: dataPath,
  733. })
  734. if len(params) != 0 {
  735. err := json.Unmarshal([]byte(params), &parameters)
  736. if err != nil {
  737. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  738. trainJobErrorNewDataPrepare(ctx, form)
  739. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
  740. return
  741. }
  742. for _, parameter := range parameters.Parameter {
  743. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  744. param = append(param, models.Parameter{
  745. Label: parameter.Label,
  746. Value: parameter.Value,
  747. })
  748. }
  749. }
  750. }
  751. //save param config
  752. if isSaveParam == "on" {
  753. if form.ParameterTemplateName == "" {
  754. log.Error("ParameterTemplateName is empty")
  755. trainJobNewDataPrepare(ctx)
  756. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
  757. return
  758. }
  759. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  760. ConfigName: form.ParameterTemplateName,
  761. Description: form.PrameterDescription,
  762. DataUrl: dataPath,
  763. AppUrl: codeObsPath,
  764. BootFileUrl: codeObsPath + bootFile,
  765. TrainUrl: outputObsPath,
  766. Flavor: models.Flavor{
  767. Code: flavorCode,
  768. },
  769. WorkServerNum: workServerNumber,
  770. EngineID: int64(engineID),
  771. LogUrl: logObsPath,
  772. PoolID: poolID,
  773. Parameter: param,
  774. })
  775. if err != nil {
  776. log.Error("Failed to CreateTrainJobConfig: %v", err)
  777. trainJobErrorNewDataPrepare(ctx, form)
  778. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
  779. return
  780. }
  781. }
  782. req := &modelarts.GenerateTrainJobReq{
  783. JobName: jobName,
  784. DataUrl: dataPath,
  785. Description: description,
  786. CodeObsPath: codeObsPath,
  787. BootFileUrl: codeObsPath + bootFile,
  788. BootFile: bootFile,
  789. TrainUrl: outputObsPath,
  790. FlavorCode: flavorCode,
  791. WorkServerNumber: workServerNumber,
  792. EngineID: int64(engineID),
  793. LogUrl: logObsPath,
  794. PoolID: poolID,
  795. Uuid: uuid,
  796. Parameters: parameters.Parameter,
  797. CommitID: commitID,
  798. IsLatestVersion: isLatestVersion,
  799. BranchName: branch_name,
  800. Params: form.Params,
  801. FlavorName: FlavorName,
  802. EngineName: EngineName,
  803. VersionCount: VersionCount,
  804. TotalVersionCount: modelarts.TotalVersionCount,
  805. }
  806. //将params转换Parameters.Parameter,出错时返回给前端
  807. var Parameters modelarts.Parameters
  808. if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
  809. ctx.ServerError("json.Unmarshal failed:", err)
  810. return
  811. }
  812. err = modelarts.GenerateTrainJob(ctx, req)
  813. if err != nil {
  814. log.Error("GenerateTrainJob failed:%v", err.Error())
  815. trainJobErrorNewDataPrepare(ctx, form)
  816. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  817. return
  818. }
  819. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  820. }
  821. func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  822. ctx.Data["PageIsTrainJob"] = true
  823. var jobID = ctx.Params(":jobid")
  824. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  825. if err != nil {
  826. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  827. versionErrorDataPrepare(ctx, form)
  828. ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form)
  829. return
  830. } else {
  831. if count >= 1 {
  832. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  833. versionErrorDataPrepare(ctx, form)
  834. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form)
  835. return
  836. }
  837. }
  838. latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
  839. if err != nil {
  840. ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
  841. return
  842. }
  843. VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(latestTask.TotalVersionCount + 1)
  844. jobName := form.JobName
  845. uuid := form.Attachment
  846. description := form.Description
  847. workServerNumber := form.WorkServerNumber
  848. engineID := form.EngineID
  849. bootFile := form.BootFile
  850. flavorCode := form.Flavor
  851. params := form.Params
  852. poolID := form.PoolID
  853. isSaveParam := form.IsSaveParam
  854. repo := ctx.Repo.Repository
  855. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  856. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
  857. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  858. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  859. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  860. branch_name := form.BranchName
  861. PreVersionName := form.VersionName
  862. FlavorName := form.FlavorName
  863. EngineName := form.EngineName
  864. isLatestVersion := modelarts.IsLatestVersion
  865. //判断权限
  866. canNewJob, _ := canUserCreateTrainJobVersion(ctx, latestTask.UserID)
  867. if !canNewJob {
  868. ctx.RenderWithErr("user cann't new trainjob", tplModelArtsTrainJobVersionNew, &form)
  869. return
  870. }
  871. if err := paramCheckCreateTrainJob(form); err != nil {
  872. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  873. versionErrorDataPrepare(ctx, form)
  874. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  875. return
  876. }
  877. // attach, err := models.GetAttachmentByUUID(uuid)
  878. // if err != nil {
  879. // log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
  880. // return
  881. // }
  882. //todo: del the codeLocalPath
  883. // _, err = ioutil.ReadDir(codeLocalPath)
  884. // if err == nil {
  885. // os.RemoveAll(codeLocalPath)
  886. // }
  887. os.RemoveAll(codeLocalPath)
  888. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  889. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  890. if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
  891. Branch: branch_name,
  892. }); err != nil {
  893. log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
  894. versionErrorDataPrepare(ctx, form)
  895. ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form)
  896. return
  897. }
  898. //todo: upload code (send to file_server todo this work?)
  899. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  900. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  901. versionErrorDataPrepare(ctx, form)
  902. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
  903. return
  904. }
  905. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  906. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  907. versionErrorDataPrepare(ctx, form)
  908. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
  909. return
  910. }
  911. parentDir := VersionOutputPath + "/"
  912. // parentDir := ""
  913. // if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  914. if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  915. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  916. versionErrorDataPrepare(ctx, form)
  917. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
  918. return
  919. }
  920. //todo: del local code?
  921. var parameters models.Parameters
  922. param := make([]models.Parameter, 0)
  923. param = append(param, models.Parameter{
  924. Label: modelarts.TrainUrl,
  925. Value: outputObsPath,
  926. }, models.Parameter{
  927. Label: modelarts.DataUrl,
  928. Value: dataPath,
  929. })
  930. if len(params) != 0 {
  931. err := json.Unmarshal([]byte(params), &parameters)
  932. if err != nil {
  933. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  934. versionErrorDataPrepare(ctx, form)
  935. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
  936. return
  937. }
  938. for _, parameter := range parameters.Parameter {
  939. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  940. param = append(param, models.Parameter{
  941. Label: parameter.Label,
  942. Value: parameter.Value,
  943. })
  944. }
  945. }
  946. }
  947. //save param config
  948. if isSaveParam == "on" {
  949. if form.ParameterTemplateName == "" {
  950. log.Error("ParameterTemplateName is empty")
  951. versionErrorDataPrepare(ctx, form)
  952. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
  953. return
  954. }
  955. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  956. ConfigName: form.ParameterTemplateName,
  957. Description: form.PrameterDescription,
  958. DataUrl: dataPath,
  959. AppUrl: codeObsPath,
  960. BootFileUrl: codeObsPath + bootFile,
  961. TrainUrl: outputObsPath,
  962. Flavor: models.Flavor{
  963. Code: flavorCode,
  964. },
  965. WorkServerNum: workServerNumber,
  966. EngineID: int64(engineID),
  967. LogUrl: logObsPath,
  968. PoolID: poolID,
  969. Parameter: parameters.Parameter,
  970. })
  971. if err != nil {
  972. log.Error("Failed to CreateTrainJobConfig: %v", err)
  973. versionErrorDataPrepare(ctx, form)
  974. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  975. return
  976. }
  977. }
  978. if err != nil {
  979. log.Error("getFlavorNameByEngineID(%s) failed:%v", engineID, err.Error())
  980. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  981. return
  982. }
  983. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
  984. if err != nil {
  985. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  986. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  987. return
  988. }
  989. req := &modelarts.GenerateTrainJobReq{
  990. JobName: task.JobName,
  991. DataUrl: dataPath,
  992. Description: description,
  993. CodeObsPath: codeObsPath,
  994. BootFileUrl: codeObsPath + bootFile,
  995. BootFile: bootFile,
  996. TrainUrl: outputObsPath,
  997. FlavorCode: flavorCode,
  998. WorkServerNumber: workServerNumber,
  999. IsLatestVersion: isLatestVersion,
  1000. EngineID: int64(engineID),
  1001. LogUrl: logObsPath,
  1002. PoolID: poolID,
  1003. Uuid: uuid,
  1004. Params: form.Params,
  1005. Parameters: parameters.Parameter,
  1006. PreVersionId: task.VersionID,
  1007. CommitID: commitID,
  1008. BranchName: branch_name,
  1009. FlavorName: FlavorName,
  1010. EngineName: EngineName,
  1011. PreVersionName: PreVersionName,
  1012. TotalVersionCount: latestTask.TotalVersionCount + 1,
  1013. }
  1014. err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
  1015. if err != nil {
  1016. log.Error("GenerateTrainJob failed:%v", err.Error())
  1017. versionErrorDataPrepare(ctx, form)
  1018. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1019. return
  1020. }
  1021. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job/" + jobID)
  1022. // ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1023. }
  1024. // readDir reads the directory named by dirname and returns
  1025. // a list of directory entries sorted by filename.
  1026. func readDir(dirname string) ([]os.FileInfo, error) {
  1027. f, err := os.Open(dirname)
  1028. if err != nil {
  1029. return nil, err
  1030. }
  1031. list, err := f.Readdir(100)
  1032. f.Close()
  1033. if err != nil {
  1034. //todo: can not upload empty folder
  1035. if err == io.EOF {
  1036. return nil, nil
  1037. }
  1038. return nil, err
  1039. }
  1040. //sort.Slice(list, func(i, j int) bool { return list[i].Name() < list[j].Name() })
  1041. return list, nil
  1042. }
  1043. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  1044. files, err := readDir(codePath)
  1045. if err != nil {
  1046. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  1047. return err
  1048. }
  1049. for _, file := range files {
  1050. if file.IsDir() {
  1051. input := &obs.PutObjectInput{}
  1052. input.Bucket = setting.Bucket
  1053. input.Key = parentDir + file.Name() + "/"
  1054. _, err = storage.ObsCli.PutObject(input)
  1055. if err != nil {
  1056. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1057. return err
  1058. }
  1059. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  1060. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  1061. return err
  1062. }
  1063. } else {
  1064. input := &obs.PutFileInput{}
  1065. input.Bucket = setting.Bucket
  1066. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  1067. input.SourceFile = codePath + file.Name()
  1068. _, err = storage.ObsCli.PutFile(input)
  1069. if err != nil {
  1070. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  1071. return err
  1072. }
  1073. }
  1074. }
  1075. return nil
  1076. }
  1077. func obsMkdir(dir string) error {
  1078. input := &obs.PutObjectInput{}
  1079. input.Bucket = setting.Bucket
  1080. input.Key = dir
  1081. _, err := storage.ObsCli.PutObject(input)
  1082. if err != nil {
  1083. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1084. return err
  1085. }
  1086. return nil
  1087. }
  1088. func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
  1089. if !strings.HasSuffix(form.BootFile, ".py") {
  1090. log.Error("the boot file(%s) must be a python file", form.BootFile)
  1091. return errors.New("启动文件必须是python文件")
  1092. }
  1093. if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 {
  1094. log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber)
  1095. return errors.New("计算节点数必须在1-25之间")
  1096. }
  1097. return nil
  1098. }
  1099. func TrainJobShow(ctx *context.Context) {
  1100. ctx.Data["PageIsCloudBrain"] = true
  1101. var jobID = ctx.Params(":jobid")
  1102. repo := ctx.Repo.Repository
  1103. page := ctx.QueryInt("page")
  1104. if page <= 0 {
  1105. page = 1
  1106. }
  1107. VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1108. ListOptions: models.ListOptions{
  1109. Page: page,
  1110. PageSize: setting.UI.IssuePagingNum,
  1111. },
  1112. RepoID: repo.ID,
  1113. Type: models.TypeCloudBrainTwo,
  1114. JobType: string(models.JobTypeTrain),
  1115. JobID: jobID,
  1116. })
  1117. if err != nil {
  1118. log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
  1119. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1120. return
  1121. }
  1122. //设置权限
  1123. canNewJob, err := canUserCreateTrainJobVersion(ctx, VersionListTasks[0].UserID)
  1124. if err != nil {
  1125. ctx.ServerError("canNewJob failed", err)
  1126. return
  1127. }
  1128. ctx.Data["canNewJob"] = canNewJob
  1129. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  1130. for i, task := range VersionListTasks {
  1131. var parameters models.Parameters
  1132. err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), &parameters)
  1133. if err != nil {
  1134. log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
  1135. trainJobNewDataPrepare(ctx)
  1136. return
  1137. }
  1138. if len(parameters.Parameter) > 0 {
  1139. paramTemp := ""
  1140. for _, Parameter := range parameters.Parameter {
  1141. param := Parameter.Label + " = " + Parameter.Value + "; "
  1142. paramTemp = paramTemp + param
  1143. }
  1144. VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
  1145. } else {
  1146. VersionListTasks[i].Parameters = ""
  1147. }
  1148. VersionListTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  1149. VersionListTasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  1150. }
  1151. pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5)
  1152. pager.SetDefaultParams(ctx)
  1153. ctx.Data["Page"] = pager
  1154. ctx.Data["jobID"] = jobID
  1155. ctx.Data["jobName"] = VersionListTasks[0].JobName
  1156. ctx.Data["version_list_task"] = VersionListTasks
  1157. ctx.Data["version_list_count"] = VersionListCount
  1158. ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1159. }
  1160. func TrainJobGetLog(ctx *context.Context) {
  1161. ctx.Data["PageIsTrainJob"] = true
  1162. var jobID = ctx.Params(":jobid")
  1163. var logFileName = ctx.Query("file_name")
  1164. var baseLine = ctx.Query("base_line")
  1165. var order = ctx.Query("order")
  1166. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1167. log.Error("order(%s) check failed", order)
  1168. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1169. return
  1170. }
  1171. task, err := models.GetCloudbrainByJobID(jobID)
  1172. if err != nil {
  1173. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1174. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1175. return
  1176. }
  1177. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1178. if err != nil {
  1179. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1180. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1181. return
  1182. }
  1183. ctx.Data["log"] = result
  1184. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1185. }
  1186. func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
  1187. task, err := models.GetCloudbrainByJobID(jobID)
  1188. if err != nil {
  1189. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1190. return nil, nil, err
  1191. }
  1192. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
  1193. if err != nil {
  1194. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  1195. return nil, nil, err
  1196. }
  1197. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines)
  1198. if err != nil {
  1199. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1200. return nil, nil, err
  1201. }
  1202. return resultLogFile, result, err
  1203. }
  1204. func TrainJobDel(ctx *context.Context) {
  1205. var jobID = ctx.Params(":jobid")
  1206. repo := ctx.Repo.Repository
  1207. VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1208. RepoID: repo.ID,
  1209. Type: models.TypeCloudBrainTwo,
  1210. JobType: string(models.JobTypeTrain),
  1211. JobID: jobID,
  1212. })
  1213. if err != nil {
  1214. ctx.ServerError("get VersionListTasks failed", err)
  1215. return
  1216. }
  1217. //删除modelarts上的任务记录
  1218. _, err = modelarts.DelTrainJob(jobID)
  1219. if err != nil {
  1220. log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error())
  1221. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1222. return
  1223. }
  1224. //删除数据库Cloudbrain表的记录
  1225. for _, task := range VersionListTasks {
  1226. err = models.DeleteJob(&task.Cloudbrain)
  1227. if err != nil {
  1228. ctx.ServerError("DeleteJob failed", err)
  1229. return
  1230. }
  1231. }
  1232. //删除存储
  1233. if len(VersionListTasks) > 0 {
  1234. DeleteJobStorage(VersionListTasks[0].JobName)
  1235. }
  1236. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1237. }
  1238. func TrainJobStop(ctx *context.Context) {
  1239. var jobID = ctx.Params(":jobid")
  1240. task, err := models.GetCloudbrainByJobID(jobID)
  1241. if err != nil {
  1242. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  1243. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1244. return
  1245. }
  1246. _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  1247. if err != nil {
  1248. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  1249. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1250. return
  1251. }
  1252. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1253. }
  1254. func canUserCreateTrainJob(uid int64) (bool, error) {
  1255. org, err := models.GetOrgByName(setting.AllowedOrg)
  1256. if err != nil {
  1257. log.Error("get allowed org failed: ", setting.AllowedOrg)
  1258. return false, err
  1259. }
  1260. return org.IsOrgMember(uid)
  1261. }
  1262. func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) {
  1263. if ctx == nil || ctx.User == nil {
  1264. log.Error("user unlogin!")
  1265. return false, nil
  1266. }
  1267. if userID == ctx.User.ID || ctx.User.IsAdmin {
  1268. return true, nil
  1269. } else {
  1270. log.Error("Only user itself and admin can new trainjob!")
  1271. return false, nil
  1272. }
  1273. }
  1274. func TrainJobGetConfigList(ctx *context.Context) {
  1275. ctx.Data["PageIsTrainJob"] = true
  1276. var jobID = ctx.Params(":jobid")
  1277. var logFileName = ctx.Query("file_name")
  1278. var baseLine = ctx.Query("base_line")
  1279. var order = ctx.Query("order")
  1280. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1281. log.Error("order(%s) check failed", order)
  1282. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1283. return
  1284. }
  1285. task, err := models.GetCloudbrainByJobID(jobID)
  1286. if err != nil {
  1287. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1288. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1289. return
  1290. }
  1291. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1292. if err != nil {
  1293. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1294. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1295. return
  1296. }
  1297. ctx.Data["log"] = result
  1298. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1299. }
  1300. func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
  1301. var result models.GetConfigListResult
  1302. list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
  1303. if err != nil {
  1304. log.Error("GetConfigList failed:", err)
  1305. return &result, err
  1306. }
  1307. for _, config := range list.ParaConfigs {
  1308. paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
  1309. if err != nil {
  1310. log.Error("GetParaConfig failed:", err)
  1311. return &result, err
  1312. }
  1313. config.Result = paraConfig
  1314. }
  1315. return list, nil
  1316. }
  1317. func ModelDownload(ctx *context.Context) {
  1318. var (
  1319. err error
  1320. )
  1321. var jobID = ctx.Params(":jobid")
  1322. versionName := ctx.Query("version_name")
  1323. parentDir := ctx.Query("parent_dir")
  1324. fileName := ctx.Query("file_name")
  1325. log.Info("DownloadSingleModelFile start.")
  1326. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  1327. if err != nil {
  1328. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  1329. return
  1330. }
  1331. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir, fileName), "/")
  1332. log.Info("Download path is:%s", path)
  1333. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  1334. if err != nil {
  1335. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  1336. ctx.ServerError("GetObsCreateSignedUrl", err)
  1337. return
  1338. }
  1339. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  1340. }
  1341. func DeleteJobStorage(jobName string) error {
  1342. //delete local
  1343. localJobPath := setting.JobPath + jobName
  1344. err := os.RemoveAll(localJobPath)
  1345. if err != nil {
  1346. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  1347. }
  1348. //delete oss
  1349. dirPath := setting.CodePathPrefix + jobName + "/"
  1350. err = storage.ObsRemoveObject(setting.Bucket, dirPath)
  1351. if err != nil {
  1352. log.Error("ObsRemoveObject(%s) failed:%v", localJobPath, err)
  1353. }
  1354. return nil
  1355. }