You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 71 kB

4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package repo
  2. import (
  3. "archive/zip"
  4. "encoding/json"
  5. "errors"
  6. "io"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "path"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "unicode/utf8"
  15. "code.gitea.io/gitea/models"
  16. "code.gitea.io/gitea/modules/auth"
  17. "code.gitea.io/gitea/modules/base"
  18. "code.gitea.io/gitea/modules/cloudbrain"
  19. "code.gitea.io/gitea/modules/context"
  20. "code.gitea.io/gitea/modules/git"
  21. "code.gitea.io/gitea/modules/log"
  22. "code.gitea.io/gitea/modules/modelarts"
  23. "code.gitea.io/gitea/modules/obs"
  24. "code.gitea.io/gitea/modules/setting"
  25. "code.gitea.io/gitea/modules/storage"
  26. )
  27. const (
  28. tplDebugJobIndex base.TplName = "repo/debugjob/index"
  29. tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index"
  30. tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new"
  31. tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show"
  32. tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index"
  33. tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
  34. tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
  35. tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
  36. tplModelArtsInferenceJobIndex base.TplName = "repo/modelarts/inferencejob/index"
  37. tplModelArtsInferenceJobNew base.TplName = "repo/modelarts/inferencejob/new"
  38. tplModelArtsInferenceJobShow base.TplName = "repo/modelarts/inferencejob/show"
  39. )
  40. func DebugJobIndex(ctx *context.Context) {
  41. debugListType := ctx.Query("debugListType")
  42. ctx.Data["ListType"] = debugListType
  43. MustEnableCloudbrain(ctx)
  44. repo := ctx.Repo.Repository
  45. page := ctx.QueryInt("page")
  46. if page <= 0 {
  47. page = 1
  48. }
  49. debugType := modelarts.DebugType
  50. jobTypeNot := false
  51. if debugListType == models.GPUResource {
  52. debugType = models.TypeCloudBrainOne
  53. } else if debugListType == models.NPUResource {
  54. debugType = models.TypeCloudBrainTwo
  55. }
  56. var jobTypes []string
  57. jobTypes = append(jobTypes, string(models.JobTypeSnn4imagenet), string(models.JobTypeBrainScore), string(models.JobTypeDebug))
  58. ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  59. ListOptions: models.ListOptions{
  60. Page: page,
  61. PageSize: setting.UI.IssuePagingNum,
  62. },
  63. RepoID: repo.ID,
  64. Type: debugType,
  65. JobTypeNot: jobTypeNot,
  66. JobTypes: jobTypes,
  67. })
  68. if err != nil {
  69. ctx.ServerError("Get debugjob faild:", err)
  70. return
  71. }
  72. for i, task := range ciTasks {
  73. ciTasks[i].CanDebug = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  74. ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  75. ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource
  76. }
  77. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  78. pager.AddParam(ctx, "debugListType", "ListType")
  79. ctx.Data["Page"] = pager
  80. ctx.Data["PageIsCloudBrain"] = true
  81. ctx.Data["Tasks"] = ciTasks
  82. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  83. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  84. ctx.HTML(200, tplDebugJobIndex)
  85. }
  86. // MustEnableDataset check if repository enable internal cb
  87. func MustEnableModelArts(ctx *context.Context) {
  88. if !ctx.Repo.CanRead(models.UnitTypeCloudBrain) {
  89. ctx.NotFound("MustEnableCloudbrain", nil)
  90. return
  91. }
  92. }
  93. func NotebookNew(ctx *context.Context) {
  94. notebookNewDataPrepare(ctx)
  95. ctx.HTML(200, tplModelArtsNotebookNew)
  96. }
  97. func notebookNewDataPrepare(ctx *context.Context) error {
  98. ctx.Data["PageIsCloudBrain"] = true
  99. t := time.Now()
  100. var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  101. ctx.Data["job_name"] = jobName
  102. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  103. if err != nil {
  104. ctx.ServerError("GetAllUserAttachments failed:", err)
  105. return err
  106. }
  107. ctx.Data["attachments"] = attachs
  108. if modelarts.ImageInfos == nil {
  109. json.Unmarshal([]byte(setting.ImageInfos), &modelarts.ImageInfos)
  110. }
  111. ctx.Data["images"] = modelarts.ImageInfos.ImageInfo
  112. if modelarts.FlavorInfos == nil {
  113. json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
  114. }
  115. ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
  116. return nil
  117. }
  118. func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  119. ctx.Data["PageIsNotebook"] = true
  120. jobName := form.JobName
  121. uuid := form.Attachment
  122. description := form.Description
  123. flavor := form.Flavor
  124. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  125. if err != nil {
  126. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  127. cloudBrainNewDataPrepare(ctx)
  128. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  129. return
  130. } else {
  131. if count >= 1 {
  132. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  133. cloudBrainNewDataPrepare(ctx)
  134. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  135. return
  136. }
  137. }
  138. _, err = models.GetCloudbrainByName(jobName)
  139. if err == nil {
  140. log.Error("the job name did already exist", ctx.Data["MsgID"])
  141. cloudBrainNewDataPrepare(ctx)
  142. ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form)
  143. return
  144. } else {
  145. if !models.IsErrJobNotExist(err) {
  146. log.Error("system error, %v", err, ctx.Data["MsgID"])
  147. cloudBrainNewDataPrepare(ctx)
  148. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  149. return
  150. }
  151. }
  152. err = modelarts.GenerateTask(ctx, jobName, uuid, description, flavor)
  153. if err != nil {
  154. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  155. return
  156. }
  157. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  158. }
  159. func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  160. ctx.Data["PageIsNotebook"] = true
  161. jobName := form.JobName
  162. uuid := form.Attachment
  163. description := form.Description
  164. flavor := form.Flavor
  165. imageId := form.ImageId
  166. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  167. if err != nil {
  168. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  169. notebookNewDataPrepare(ctx)
  170. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  171. return
  172. } else {
  173. if count >= 1 {
  174. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  175. notebookNewDataPrepare(ctx)
  176. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  177. return
  178. }
  179. }
  180. _, err = models.GetCloudbrainByName(jobName)
  181. if err == nil {
  182. log.Error("the job name did already exist", ctx.Data["MsgID"])
  183. notebookNewDataPrepare(ctx)
  184. ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form)
  185. return
  186. } else {
  187. if !models.IsErrJobNotExist(err) {
  188. log.Error("system error, %v", err, ctx.Data["MsgID"])
  189. notebookNewDataPrepare(ctx)
  190. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  191. return
  192. }
  193. }
  194. err = modelarts.GenerateNotebook2(ctx, jobName, uuid, description, flavor, imageId)
  195. if err != nil {
  196. log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"])
  197. notebookNewDataPrepare(ctx)
  198. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  199. return
  200. }
  201. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  202. }
  203. func NotebookShow(ctx *context.Context) {
  204. ctx.Data["PageIsCloudBrain"] = true
  205. var jobID = ctx.Params(":jobid")
  206. task, err := models.GetCloudbrainByJobID(jobID)
  207. if err != nil {
  208. ctx.Data["error"] = err.Error()
  209. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  210. return
  211. }
  212. result, err := modelarts.GetNotebook2(jobID)
  213. if err != nil {
  214. ctx.Data["error"] = err.Error()
  215. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  216. return
  217. }
  218. if result != nil {
  219. task.Status = result.Status
  220. err = models.UpdateJob(task)
  221. if err != nil {
  222. ctx.Data["error"] = err.Error()
  223. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  224. return
  225. }
  226. result.CreateTime = time.Unix(int64(result.CreateAt/1000), 0).Format("2006-01-02 15:04:05")
  227. result.LatestUpdateTime = time.Unix(int64(result.UpdateAt/1000), 0).Format("2006-01-02 15:04:05")
  228. }
  229. datasetDownloadLink := "-"
  230. if ctx.IsSigned {
  231. if task.Uuid != "" && task.UserID == ctx.User.ID {
  232. attachment, err := models.GetAttachmentByUUID(task.Uuid)
  233. if err == nil {
  234. datasetDownloadLink = attachment.S3DownloadURL()
  235. }
  236. }
  237. }
  238. ctx.Data["datasetDownloadLink"] = datasetDownloadLink
  239. ctx.Data["task"] = task
  240. ctx.Data["jobID"] = jobID
  241. ctx.Data["jobName"] = task.JobName
  242. ctx.Data["result"] = result
  243. ctx.HTML(200, tplModelArtsNotebookShow)
  244. }
  245. func NotebookDebug(ctx *context.Context) {
  246. var jobID = ctx.Params(":jobid")
  247. result, err := modelarts.GetJob(jobID)
  248. if err != nil {
  249. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  250. return
  251. }
  252. res, err := modelarts.GetJobToken(jobID)
  253. if err != nil {
  254. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  255. return
  256. }
  257. urls := strings.Split(result.Spec.Annotations.Url, "/")
  258. urlPrefix := result.Spec.Annotations.TargetDomain
  259. for i, url := range urls {
  260. if i > 2 {
  261. urlPrefix += "/" + url
  262. }
  263. }
  264. debugUrl := urlPrefix + "?token=" + res.Token
  265. ctx.Redirect(debugUrl)
  266. }
  267. func NotebookDebug2(ctx *context.Context) {
  268. var jobID = ctx.Params(":jobid")
  269. result, err := modelarts.GetNotebook2(jobID)
  270. if err != nil {
  271. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  272. return
  273. }
  274. ctx.Redirect(result.Url)
  275. }
  276. func NotebookManage(ctx *context.Context) {
  277. var jobID = ctx.Params(":jobid")
  278. var action = ctx.Params(":action")
  279. var resultCode = "0"
  280. var errorMsg = ""
  281. var status = ""
  282. for {
  283. task, err := models.GetCloudbrainByJobID(jobID)
  284. if err != nil {
  285. log.Error("GetCloudbrainByJobID failed:%v", err, ctx.Data["MsgID"])
  286. resultCode = "-1"
  287. errorMsg = "system error"
  288. break
  289. }
  290. if action == models.ActionStop {
  291. if task.Status != string(models.ModelArtsRunning) {
  292. log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
  293. resultCode = "-1"
  294. errorMsg = "the job is not running"
  295. break
  296. }
  297. if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin() && !ctx.IsUserRepoOwner()) {
  298. log.Error("the user has no right ro stop the job", task.JobName, ctx.Data["MsgID"])
  299. resultCode = "-1"
  300. errorMsg = "you have no right to stop the job"
  301. break
  302. }
  303. } else if action == models.ActionRestart {
  304. if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
  305. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  306. resultCode = "-1"
  307. errorMsg = "the job is not stopped"
  308. break
  309. }
  310. if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin()) {
  311. log.Error("the user has no right ro restart the job", task.JobName, ctx.Data["MsgID"])
  312. resultCode = "-1"
  313. errorMsg = "you have no right to restart the job"
  314. break
  315. }
  316. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  317. if err != nil {
  318. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  319. resultCode = "-1"
  320. errorMsg = "system error"
  321. break
  322. } else {
  323. if count >= 1 {
  324. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  325. resultCode = "-1"
  326. errorMsg = "you have already a running or waiting task, can not create more"
  327. break
  328. }
  329. }
  330. action = models.ActionStart
  331. } else {
  332. log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"])
  333. resultCode = "-1"
  334. errorMsg = "非法操作"
  335. break
  336. }
  337. param := models.NotebookAction{
  338. Action: action,
  339. }
  340. res, err := modelarts.ManageNotebook2(jobID, param)
  341. if err != nil {
  342. log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  343. resultCode = "-1"
  344. errorMsg = err.Error()
  345. if strings.Contains(err.Error(), modelarts.NotebookNotFound) {
  346. errorMsg = "the job's version is too old and can not be restarted"
  347. }
  348. break
  349. }
  350. task.Status = res.Status
  351. err = models.UpdateJob(task)
  352. if err != nil {
  353. log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  354. resultCode = "-1"
  355. errorMsg = "system error"
  356. break
  357. }
  358. status = task.Status
  359. break
  360. }
  361. ctx.JSON(200, map[string]string{
  362. "result_code": resultCode,
  363. "error_msg": errorMsg,
  364. "status": status,
  365. "job_id": jobID,
  366. })
  367. }
  368. func NotebookDel(ctx *context.Context) {
  369. var jobID = ctx.Params(":jobid")
  370. var listType = ctx.Query("debugListType")
  371. task := ctx.Cloudbrain
  372. if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) {
  373. log.Error("the job(%s) has not been stopped", task.JobName)
  374. ctx.ServerError("the job has not been stopped", errors.New("the job has not been stopped"))
  375. return
  376. }
  377. _, err := modelarts.DelNotebook2(jobID)
  378. if err != nil {
  379. log.Error("DelNotebook2(%s) failed:%v", task.JobName, err.Error())
  380. if strings.Contains(err.Error(), modelarts.NotebookNotFound) {
  381. log.Info("old notebook version")
  382. } else {
  383. ctx.ServerError("DelNotebook2 failed", err)
  384. return
  385. }
  386. }
  387. err = models.DeleteJob(task)
  388. if err != nil {
  389. ctx.ServerError("DeleteJob failed", err)
  390. return
  391. }
  392. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  393. }
  394. func TrainJobIndex(ctx *context.Context) {
  395. MustEnableModelArts(ctx)
  396. repo := ctx.Repo.Repository
  397. page := ctx.QueryInt("page")
  398. if page <= 0 {
  399. page = 1
  400. }
  401. var jobTypes []string
  402. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  403. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  404. ListOptions: models.ListOptions{
  405. Page: page,
  406. PageSize: setting.UI.IssuePagingNum,
  407. },
  408. RepoID: repo.ID,
  409. Type: models.TypeCloudBrainTwo,
  410. JobTypeNot: false,
  411. JobTypes: jobTypes,
  412. IsLatestVersion: modelarts.IsLatestVersion,
  413. })
  414. if err != nil {
  415. ctx.ServerError("Cloudbrain", err)
  416. return
  417. }
  418. for i, task := range tasks {
  419. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  420. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  421. tasks[i].ComputeResource = models.NPUResource
  422. }
  423. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  424. pager.SetDefaultParams(ctx)
  425. ctx.Data["Page"] = pager
  426. ctx.Data["PageIsCloudBrain"] = true
  427. ctx.Data["Tasks"] = tasks
  428. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  429. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  430. ctx.HTML(200, tplModelArtsTrainJobIndex)
  431. }
  432. func TrainJobNew(ctx *context.Context) {
  433. err := trainJobNewDataPrepare(ctx)
  434. if err != nil {
  435. ctx.ServerError("get new train-job info failed", err)
  436. return
  437. }
  438. ctx.HTML(200, tplModelArtsTrainJobNew)
  439. }
  440. func trainJobNewDataPrepare(ctx *context.Context) error {
  441. ctx.Data["PageIsCloudBrain"] = true
  442. //can, err := canUserCreateTrainJob(ctx.User.ID)
  443. //if err != nil {
  444. // ctx.ServerError("canUserCreateTrainJob", err)
  445. // return
  446. //}
  447. //
  448. //if !can {
  449. // log.Error("the user can not create train-job")
  450. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  451. // return
  452. //}
  453. t := time.Now()
  454. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  455. ctx.Data["job_name"] = jobName
  456. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  457. if err != nil {
  458. ctx.ServerError("GetAllUserAttachments failed:", err)
  459. return err
  460. }
  461. ctx.Data["attachments"] = attachs
  462. var resourcePools modelarts.ResourcePool
  463. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  464. ctx.ServerError("json.Unmarshal failed:", err)
  465. return err
  466. }
  467. ctx.Data["resource_pools"] = resourcePools.Info
  468. var engines modelarts.Engine
  469. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  470. ctx.ServerError("json.Unmarshal failed:", err)
  471. return err
  472. }
  473. ctx.Data["engines"] = engines.Info
  474. var versionInfos modelarts.VersionInfo
  475. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  476. ctx.ServerError("json.Unmarshal failed:", err)
  477. return err
  478. }
  479. ctx.Data["engine_versions"] = versionInfos.Version
  480. var flavorInfos modelarts.Flavor
  481. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  482. ctx.ServerError("json.Unmarshal failed:", err)
  483. return err
  484. }
  485. ctx.Data["flavor_infos"] = flavorInfos.Info
  486. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  487. ctx.Data["train_url"] = outputObsPath
  488. ctx.Data["params"] = ""
  489. ctx.Data["branchName"] = ctx.Repo.BranchName
  490. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  491. if err != nil {
  492. ctx.ServerError("getConfigList failed:", err)
  493. return err
  494. }
  495. ctx.Data["config_list"] = configList.ParaConfigs
  496. return nil
  497. }
  498. func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  499. ctx.Data["PageIsCloudBrain"] = true
  500. //can, err := canUserCreateTrainJob(ctx.User.ID)
  501. //if err != nil {
  502. // ctx.ServerError("canUserCreateTrainJob", err)
  503. // return
  504. //}
  505. //
  506. //if !can {
  507. // log.Error("the user can not create train-job")
  508. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  509. // return
  510. //}
  511. t := time.Now()
  512. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  513. ctx.Data["job_name"] = jobName
  514. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  515. if err != nil {
  516. ctx.ServerError("GetAllUserAttachments failed:", err)
  517. return err
  518. }
  519. ctx.Data["attachments"] = attachs
  520. var resourcePools modelarts.ResourcePool
  521. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  522. ctx.ServerError("json.Unmarshal failed:", err)
  523. return err
  524. }
  525. ctx.Data["resource_pools"] = resourcePools.Info
  526. var engines modelarts.Engine
  527. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  528. ctx.ServerError("json.Unmarshal failed:", err)
  529. return err
  530. }
  531. ctx.Data["engines"] = engines.Info
  532. var versionInfos modelarts.VersionInfo
  533. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  534. ctx.ServerError("json.Unmarshal failed:", err)
  535. return err
  536. }
  537. ctx.Data["engine_versions"] = versionInfos.Version
  538. var flavorInfos modelarts.Flavor
  539. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  540. ctx.ServerError("json.Unmarshal failed:", err)
  541. return err
  542. }
  543. ctx.Data["flavor_infos"] = flavorInfos.Info
  544. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  545. ctx.Data["train_url"] = outputObsPath
  546. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  547. if err != nil {
  548. ctx.ServerError("getConfigList failed:", err)
  549. return err
  550. }
  551. var Parameters modelarts.Parameters
  552. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  553. ctx.ServerError("json.Unmarshal failed:", err)
  554. return err
  555. }
  556. ctx.Data["params"] = Parameters.Parameter
  557. ctx.Data["config_list"] = configList.ParaConfigs
  558. ctx.Data["bootFile"] = form.BootFile
  559. ctx.Data["uuid"] = form.Attachment
  560. ctx.Data["branch_name"] = form.BranchName
  561. return nil
  562. }
  563. func TrainJobNewVersion(ctx *context.Context) {
  564. err := trainJobNewVersionDataPrepare(ctx)
  565. if err != nil {
  566. ctx.ServerError("get new train-job info failed", err)
  567. return
  568. }
  569. ctx.HTML(200, tplModelArtsTrainJobVersionNew)
  570. }
  571. func trainJobNewVersionDataPrepare(ctx *context.Context) error {
  572. ctx.Data["PageIsCloudBrain"] = true
  573. var jobID = ctx.Params(":jobid")
  574. var versionName = ctx.Query("version_name")
  575. // canNewJob, err := canUserCreateTrainJobVersion(ctx, jobID, versionName)
  576. // if err != nil {
  577. // ctx.ServerError("canNewJob can info failed", err)
  578. // return err
  579. // }
  580. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  581. if err != nil {
  582. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  583. return err
  584. }
  585. t := time.Now()
  586. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  587. ctx.Data["job_name"] = task.JobName
  588. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  589. if err != nil {
  590. ctx.ServerError("GetAllUserAttachments failed:", err)
  591. return err
  592. }
  593. ctx.Data["attachments"] = attachs
  594. var resourcePools modelarts.ResourcePool
  595. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  596. ctx.ServerError("json.Unmarshal failed:", err)
  597. return err
  598. }
  599. ctx.Data["resource_pools"] = resourcePools.Info
  600. var engines modelarts.Engine
  601. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  602. ctx.ServerError("json.Unmarshal failed:", err)
  603. return err
  604. }
  605. ctx.Data["engines"] = engines.Info
  606. var versionInfos modelarts.VersionInfo
  607. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  608. ctx.ServerError("json.Unmarshal failed:", err)
  609. return err
  610. }
  611. ctx.Data["engine_versions"] = versionInfos.Version
  612. var flavorInfos modelarts.Flavor
  613. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  614. ctx.ServerError("json.Unmarshal failed:", err)
  615. return err
  616. }
  617. ctx.Data["flavor_infos"] = flavorInfos.Info
  618. var Parameters modelarts.Parameters
  619. if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil {
  620. ctx.ServerError("json.Unmarshal failed:", err)
  621. return err
  622. }
  623. ctx.Data["params"] = Parameters.Parameter
  624. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  625. ctx.Data["train_url"] = outputObsPath
  626. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  627. if err != nil {
  628. ctx.ServerError("GetBranches error:", err)
  629. return err
  630. }
  631. ctx.Data["branches"] = branches
  632. ctx.Data["branch_name"] = task.BranchName
  633. ctx.Data["description"] = task.Description
  634. ctx.Data["boot_file"] = task.BootFile
  635. ctx.Data["dataset_name"] = task.DatasetName
  636. ctx.Data["work_server_number"] = task.WorkServerNumber
  637. ctx.Data["flavor_name"] = task.FlavorName
  638. ctx.Data["engine_name"] = task.EngineName
  639. ctx.Data["uuid"] = task.Uuid
  640. ctx.Data["flavor_code"] = task.FlavorCode
  641. ctx.Data["engine_id"] = task.EngineID
  642. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  643. if err != nil {
  644. ctx.ServerError("getConfigList failed:", err)
  645. return err
  646. }
  647. ctx.Data["config_list"] = configList.ParaConfigs
  648. return nil
  649. }
  650. func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  651. ctx.Data["PageIsCloudBrain"] = true
  652. var jobID = ctx.Params(":jobid")
  653. // var versionName = ctx.Params(":version-name")
  654. var versionName = ctx.Query("version_name")
  655. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  656. if err != nil {
  657. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  658. return err
  659. }
  660. t := time.Now()
  661. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  662. ctx.Data["job_name"] = task.JobName
  663. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  664. if err != nil {
  665. ctx.ServerError("GetAllUserAttachments failed:", err)
  666. return err
  667. }
  668. ctx.Data["attachments"] = attachs
  669. var resourcePools modelarts.ResourcePool
  670. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  671. ctx.ServerError("json.Unmarshal failed:", err)
  672. return err
  673. }
  674. ctx.Data["resource_pools"] = resourcePools.Info
  675. var engines modelarts.Engine
  676. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  677. ctx.ServerError("json.Unmarshal failed:", err)
  678. return err
  679. }
  680. ctx.Data["engines"] = engines.Info
  681. var versionInfos modelarts.VersionInfo
  682. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  683. ctx.ServerError("json.Unmarshal failed:", err)
  684. return err
  685. }
  686. ctx.Data["engine_versions"] = versionInfos.Version
  687. var flavorInfos modelarts.Flavor
  688. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  689. ctx.ServerError("json.Unmarshal failed:", err)
  690. return err
  691. }
  692. ctx.Data["flavor_infos"] = flavorInfos.Info
  693. var Parameters modelarts.Parameters
  694. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  695. ctx.ServerError("json.Unmarshal failed:", err)
  696. return err
  697. }
  698. ctx.Data["params"] = Parameters.Parameter
  699. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  700. ctx.Data["train_url"] = outputObsPath
  701. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  702. if err != nil {
  703. ctx.ServerError("GetBranches error:", err)
  704. return err
  705. }
  706. ctx.Data["branches"] = branches
  707. ctx.Data["description"] = form.Description
  708. ctx.Data["dataset_name"] = task.DatasetName
  709. ctx.Data["work_server_number"] = form.WorkServerNumber
  710. ctx.Data["flavor_name"] = form.FlavorName
  711. ctx.Data["engine_name"] = form.EngineName
  712. ctx.Data["flavor_code"] = task.FlavorCode
  713. ctx.Data["engine_id"] = task.EngineID
  714. ctx.Data["version_name"] = form.VersionName
  715. ctx.Data["bootFile"] = form.BootFile
  716. ctx.Data["uuid"] = form.Attachment
  717. ctx.Data["branch_name"] = form.BranchName
  718. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  719. if err != nil {
  720. ctx.ServerError("getConfigList failed:", err)
  721. return err
  722. }
  723. ctx.Data["config_list"] = configList.ParaConfigs
  724. return nil
  725. }
  726. func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  727. ctx.Data["PageIsTrainJob"] = true
  728. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  729. jobName := form.JobName
  730. uuid := form.Attachment
  731. description := form.Description
  732. workServerNumber := form.WorkServerNumber
  733. engineID := form.EngineID
  734. bootFile := form.BootFile
  735. flavorCode := form.Flavor
  736. params := form.Params
  737. poolID := form.PoolID
  738. isSaveParam := form.IsSaveParam
  739. repo := ctx.Repo.Repository
  740. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  741. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  742. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  743. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  744. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  745. branch_name := form.BranchName
  746. isLatestVersion := modelarts.IsLatestVersion
  747. FlavorName := form.FlavorName
  748. VersionCount := modelarts.VersionCount
  749. EngineName := form.EngineName
  750. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  751. if err != nil {
  752. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  753. trainJobErrorNewDataPrepare(ctx, form)
  754. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  755. return
  756. } else {
  757. if count >= 1 {
  758. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  759. trainJobErrorNewDataPrepare(ctx, form)
  760. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
  761. return
  762. }
  763. }
  764. if err := paramCheckCreateTrainJob(form); err != nil {
  765. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  766. trainJobErrorNewDataPrepare(ctx, form)
  767. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  768. return
  769. }
  770. //todo: del the codeLocalPath
  771. _, err = ioutil.ReadDir(codeLocalPath)
  772. if err == nil {
  773. os.RemoveAll(codeLocalPath)
  774. }
  775. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  776. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  777. if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
  778. Branch: branch_name,
  779. }); err != nil {
  780. log.Error("创建任务失败,服务器超时!: %s (%v)", repo.FullName(), err)
  781. trainJobErrorNewDataPrepare(ctx, form)
  782. ctx.RenderWithErr("创建任务失败,服务器超时!", tplModelArtsTrainJobNew, &form)
  783. return
  784. }
  785. //todo: upload code (send to file_server todo this work?)
  786. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  787. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  788. trainJobErrorNewDataPrepare(ctx, form)
  789. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
  790. return
  791. }
  792. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  793. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  794. trainJobErrorNewDataPrepare(ctx, form)
  795. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
  796. return
  797. }
  798. // parentDir := VersionOutputPath + "/"
  799. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  800. // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  801. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  802. trainJobErrorNewDataPrepare(ctx, form)
  803. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
  804. return
  805. }
  806. //todo: del local code?
  807. var parameters models.Parameters
  808. param := make([]models.Parameter, 0)
  809. param = append(param, models.Parameter{
  810. Label: modelarts.TrainUrl,
  811. Value: outputObsPath,
  812. }, models.Parameter{
  813. Label: modelarts.DataUrl,
  814. Value: dataPath,
  815. })
  816. if len(params) != 0 {
  817. err := json.Unmarshal([]byte(params), &parameters)
  818. if err != nil {
  819. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  820. trainJobErrorNewDataPrepare(ctx, form)
  821. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
  822. return
  823. }
  824. for _, parameter := range parameters.Parameter {
  825. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  826. param = append(param, models.Parameter{
  827. Label: parameter.Label,
  828. Value: parameter.Value,
  829. })
  830. }
  831. }
  832. }
  833. //save param config
  834. if isSaveParam == "on" {
  835. if form.ParameterTemplateName == "" {
  836. log.Error("ParameterTemplateName is empty")
  837. trainJobNewDataPrepare(ctx)
  838. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
  839. return
  840. }
  841. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  842. ConfigName: form.ParameterTemplateName,
  843. Description: form.PrameterDescription,
  844. DataUrl: dataPath,
  845. AppUrl: codeObsPath,
  846. BootFileUrl: codeObsPath + bootFile,
  847. TrainUrl: outputObsPath,
  848. Flavor: models.Flavor{
  849. Code: flavorCode,
  850. },
  851. WorkServerNum: workServerNumber,
  852. EngineID: int64(engineID),
  853. LogUrl: logObsPath,
  854. PoolID: poolID,
  855. Parameter: param,
  856. })
  857. if err != nil {
  858. log.Error("Failed to CreateTrainJobConfig: %v", err)
  859. trainJobErrorNewDataPrepare(ctx, form)
  860. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
  861. return
  862. }
  863. }
  864. req := &modelarts.GenerateTrainJobReq{
  865. JobName: jobName,
  866. DataUrl: dataPath,
  867. Description: description,
  868. CodeObsPath: codeObsPath,
  869. BootFileUrl: codeObsPath + bootFile,
  870. BootFile: bootFile,
  871. TrainUrl: outputObsPath,
  872. FlavorCode: flavorCode,
  873. WorkServerNumber: workServerNumber,
  874. EngineID: int64(engineID),
  875. LogUrl: logObsPath,
  876. PoolID: poolID,
  877. Uuid: uuid,
  878. Parameters: parameters.Parameter,
  879. CommitID: commitID,
  880. IsLatestVersion: isLatestVersion,
  881. BranchName: branch_name,
  882. Params: form.Params,
  883. FlavorName: FlavorName,
  884. EngineName: EngineName,
  885. VersionCount: VersionCount,
  886. TotalVersionCount: modelarts.TotalVersionCount,
  887. }
  888. //将params转换Parameters.Parameter,出错时返回给前端
  889. var Parameters modelarts.Parameters
  890. if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
  891. ctx.ServerError("json.Unmarshal failed:", err)
  892. return
  893. }
  894. err = modelarts.GenerateTrainJob(ctx, req)
  895. if err != nil {
  896. log.Error("GenerateTrainJob failed:%v", err.Error())
  897. trainJobErrorNewDataPrepare(ctx, form)
  898. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  899. return
  900. }
  901. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  902. }
  903. func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  904. ctx.Data["PageIsTrainJob"] = true
  905. var jobID = ctx.Params(":jobid")
  906. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  907. if err != nil {
  908. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  909. versionErrorDataPrepare(ctx, form)
  910. ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form)
  911. return
  912. } else {
  913. if count >= 1 {
  914. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  915. versionErrorDataPrepare(ctx, form)
  916. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form)
  917. return
  918. }
  919. }
  920. latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
  921. if err != nil {
  922. ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
  923. return
  924. }
  925. VersionOutputPath := modelarts.GetOutputPathByCount(latestTask.TotalVersionCount + 1)
  926. jobName := form.JobName
  927. uuid := form.Attachment
  928. description := form.Description
  929. workServerNumber := form.WorkServerNumber
  930. engineID := form.EngineID
  931. bootFile := form.BootFile
  932. flavorCode := form.Flavor
  933. params := form.Params
  934. poolID := form.PoolID
  935. isSaveParam := form.IsSaveParam
  936. repo := ctx.Repo.Repository
  937. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  938. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
  939. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  940. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  941. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  942. branch_name := form.BranchName
  943. PreVersionName := form.VersionName
  944. FlavorName := form.FlavorName
  945. EngineName := form.EngineName
  946. isLatestVersion := modelarts.IsLatestVersion
  947. //判断权限
  948. canNewJob, _ := canUserCreateTrainJobVersion(ctx, latestTask.UserID)
  949. if !canNewJob {
  950. ctx.RenderWithErr("user cann't new trainjob", tplModelArtsTrainJobVersionNew, &form)
  951. return
  952. }
  953. if err := paramCheckCreateTrainJob(form); err != nil {
  954. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  955. versionErrorDataPrepare(ctx, form)
  956. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  957. return
  958. }
  959. //todo: del the codeLocalPath
  960. _, err = ioutil.ReadDir(codeLocalPath)
  961. if err == nil {
  962. os.RemoveAll(codeLocalPath)
  963. } else {
  964. log.Error("创建任务失败,原代码还未删除,请重试!: %s (%v)", repo.FullName(), err)
  965. versionErrorDataPrepare(ctx, form)
  966. ctx.RenderWithErr("创建任务失败,原代码还未删除,请重试!", tplModelArtsTrainJobVersionNew, &form)
  967. return
  968. }
  969. // os.RemoveAll(codeLocalPath)
  970. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  971. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  972. if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
  973. Branch: branch_name,
  974. }); err != nil {
  975. log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
  976. versionErrorDataPrepare(ctx, form)
  977. ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form)
  978. return
  979. }
  980. //todo: upload code (send to file_server todo this work?)
  981. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  982. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  983. versionErrorDataPrepare(ctx, form)
  984. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
  985. return
  986. }
  987. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  988. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  989. versionErrorDataPrepare(ctx, form)
  990. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
  991. return
  992. }
  993. parentDir := VersionOutputPath + "/"
  994. // parentDir := ""
  995. // if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  996. if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  997. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  998. versionErrorDataPrepare(ctx, form)
  999. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
  1000. return
  1001. }
  1002. //todo: del local code?
  1003. var parameters models.Parameters
  1004. param := make([]models.Parameter, 0)
  1005. param = append(param, models.Parameter{
  1006. Label: modelarts.TrainUrl,
  1007. Value: outputObsPath,
  1008. }, models.Parameter{
  1009. Label: modelarts.DataUrl,
  1010. Value: dataPath,
  1011. })
  1012. if len(params) != 0 {
  1013. err := json.Unmarshal([]byte(params), &parameters)
  1014. if err != nil {
  1015. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1016. versionErrorDataPrepare(ctx, form)
  1017. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
  1018. return
  1019. }
  1020. for _, parameter := range parameters.Parameter {
  1021. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  1022. param = append(param, models.Parameter{
  1023. Label: parameter.Label,
  1024. Value: parameter.Value,
  1025. })
  1026. }
  1027. }
  1028. }
  1029. //save param config
  1030. if isSaveParam == "on" {
  1031. if form.ParameterTemplateName == "" {
  1032. log.Error("ParameterTemplateName is empty")
  1033. versionErrorDataPrepare(ctx, form)
  1034. ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
  1035. return
  1036. }
  1037. _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  1038. ConfigName: form.ParameterTemplateName,
  1039. Description: form.PrameterDescription,
  1040. DataUrl: dataPath,
  1041. AppUrl: codeObsPath,
  1042. BootFileUrl: codeObsPath + bootFile,
  1043. TrainUrl: outputObsPath,
  1044. Flavor: models.Flavor{
  1045. Code: flavorCode,
  1046. },
  1047. WorkServerNum: workServerNumber,
  1048. EngineID: int64(engineID),
  1049. LogUrl: logObsPath,
  1050. PoolID: poolID,
  1051. Parameter: parameters.Parameter,
  1052. })
  1053. if err != nil {
  1054. log.Error("Failed to CreateTrainJobConfig: %v", err)
  1055. versionErrorDataPrepare(ctx, form)
  1056. ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1057. return
  1058. }
  1059. }
  1060. if err != nil {
  1061. log.Error("getFlavorNameByEngineID(%s) failed:%v", engineID, err.Error())
  1062. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1063. return
  1064. }
  1065. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
  1066. if err != nil {
  1067. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  1068. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1069. return
  1070. }
  1071. req := &modelarts.GenerateTrainJobReq{
  1072. JobName: task.JobName,
  1073. DataUrl: dataPath,
  1074. Description: description,
  1075. CodeObsPath: codeObsPath,
  1076. BootFileUrl: codeObsPath + bootFile,
  1077. BootFile: bootFile,
  1078. TrainUrl: outputObsPath,
  1079. FlavorCode: flavorCode,
  1080. WorkServerNumber: workServerNumber,
  1081. IsLatestVersion: isLatestVersion,
  1082. EngineID: int64(engineID),
  1083. LogUrl: logObsPath,
  1084. PoolID: poolID,
  1085. Uuid: uuid,
  1086. Params: form.Params,
  1087. Parameters: parameters.Parameter,
  1088. PreVersionId: task.VersionID,
  1089. CommitID: commitID,
  1090. BranchName: branch_name,
  1091. FlavorName: FlavorName,
  1092. EngineName: EngineName,
  1093. PreVersionName: PreVersionName,
  1094. TotalVersionCount: latestTask.TotalVersionCount + 1,
  1095. }
  1096. err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
  1097. if err != nil {
  1098. log.Error("GenerateTrainJob failed:%v", err.Error())
  1099. versionErrorDataPrepare(ctx, form)
  1100. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1101. return
  1102. }
  1103. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job/" + jobID)
  1104. // ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1105. }
  // readDir reads the directory named by dirname and returns all of its
  // entries, in the order the operating system reports them (the list is not
  // sorted). An empty directory yields (nil, nil).
  func readDir(dirname string) ([]os.FileInfo, error) {
  	f, err := os.Open(dirname)
  	if err != nil {
  		return nil, err
  	}
  	defer f.Close()

  	// Read in batches until the end of the directory so that directories
  	// with more entries than one batch are returned completely.
  	var list []os.FileInfo
  	for {
  		batch, err := f.Readdir(100)
  		list = append(list, batch...)
  		if err == io.EOF {
  			//todo: can not upload empty folder
  			return list, nil
  		}
  		if err != nil {
  			return nil, err
  		}
  	}
  }
  1125. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  1126. files, err := readDir(codePath)
  1127. if err != nil {
  1128. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  1129. return err
  1130. }
  1131. for _, file := range files {
  1132. if file.IsDir() {
  1133. input := &obs.PutObjectInput{}
  1134. input.Bucket = setting.Bucket
  1135. input.Key = parentDir + file.Name() + "/"
  1136. _, err = storage.ObsCli.PutObject(input)
  1137. if err != nil {
  1138. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1139. return err
  1140. }
  1141. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  1142. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  1143. return err
  1144. }
  1145. } else {
  1146. input := &obs.PutFileInput{}
  1147. input.Bucket = setting.Bucket
  1148. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  1149. input.SourceFile = codePath + file.Name()
  1150. _, err = storage.ObsCli.PutFile(input)
  1151. if err != nil {
  1152. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  1153. return err
  1154. }
  1155. }
  1156. }
  1157. return nil
  1158. }
  1159. func obsMkdir(dir string) error {
  1160. input := &obs.PutObjectInput{}
  1161. input.Bucket = setting.Bucket
  1162. input.Key = dir
  1163. _, err := storage.ObsCli.PutObject(input)
  1164. if err != nil {
  1165. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1166. return err
  1167. }
  1168. return nil
  1169. }
  1170. func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
  1171. if !strings.HasSuffix(form.BootFile, ".py") {
  1172. log.Error("the boot file(%s) must be a python file", form.BootFile)
  1173. return errors.New("启动文件必须是python文件")
  1174. }
  1175. if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 {
  1176. log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber)
  1177. return errors.New("计算节点数必须在1-25之间")
  1178. }
  1179. if form.BranchName == "" {
  1180. log.Error("the branch must not be null!", form.BranchName)
  1181. return errors.New("代码分支不能为空!")
  1182. }
  1183. return nil
  1184. }
  1185. func paramCheckCreateInferenceJob(form auth.CreateModelArtsInferenceJobForm) error {
  1186. if !strings.HasSuffix(form.BootFile, ".py") {
  1187. log.Error("the boot file(%s) must be a python file", form.BootFile)
  1188. return errors.New("启动文件必须是python文件")
  1189. }
  1190. if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 {
  1191. log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber)
  1192. return errors.New("计算节点数必须在1-25之间")
  1193. }
  1194. if form.ModelName == "" {
  1195. log.Error("the ModelName(%d) must not be nil", form.ModelName)
  1196. return errors.New("模型名称不能为空")
  1197. }
  1198. if form.ModelVersion == "" {
  1199. log.Error("the ModelVersion(%d) must not be nil", form.ModelVersion)
  1200. return errors.New("模型版本不能为空")
  1201. }
  1202. if form.CkptName == "" {
  1203. log.Error("the CkptName(%d) must not be nil", form.CkptName)
  1204. return errors.New("权重文件不能为空")
  1205. }
  1206. if form.BranchName == "" {
  1207. log.Error("the Branch(%d) must not be nil", form.BranchName)
  1208. return errors.New("分支名不能为空")
  1209. }
  1210. if utf8.RuneCountInString(form.Description) > 255 {
  1211. log.Error("the Description length(%d) must not more than 255", form.Description)
  1212. return errors.New("描述字符不能超过255个字符")
  1213. }
  1214. return nil
  1215. }
  1216. func TrainJobShow(ctx *context.Context) {
  1217. ctx.Data["PageIsCloudBrain"] = true
  1218. var jobID = ctx.Params(":jobid")
  1219. repo := ctx.Repo.Repository
  1220. page := ctx.QueryInt("page")
  1221. if page <= 0 {
  1222. page = 1
  1223. }
  1224. var jobTypes []string
  1225. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  1226. VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1227. ListOptions: models.ListOptions{
  1228. Page: page,
  1229. PageSize: setting.UI.IssuePagingNum,
  1230. },
  1231. RepoID: repo.ID,
  1232. Type: models.TypeCloudBrainTwo,
  1233. JobTypes: jobTypes,
  1234. JobID: jobID,
  1235. })
  1236. if err != nil {
  1237. log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
  1238. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1239. return
  1240. }
  1241. //设置权限
  1242. canNewJob, err := canUserCreateTrainJobVersion(ctx, VersionListTasks[0].UserID)
  1243. if err != nil {
  1244. ctx.ServerError("canNewJob failed", err)
  1245. return
  1246. }
  1247. ctx.Data["canNewJob"] = canNewJob
  1248. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  1249. for i, task := range VersionListTasks {
  1250. var parameters models.Parameters
  1251. err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), &parameters)
  1252. if err != nil {
  1253. log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
  1254. trainJobNewDataPrepare(ctx)
  1255. return
  1256. }
  1257. if len(parameters.Parameter) > 0 {
  1258. paramTemp := ""
  1259. for _, Parameter := range parameters.Parameter {
  1260. param := Parameter.Label + " = " + Parameter.Value + "; "
  1261. paramTemp = paramTemp + param
  1262. }
  1263. VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
  1264. } else {
  1265. VersionListTasks[i].Parameters = ""
  1266. }
  1267. VersionListTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  1268. VersionListTasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  1269. }
  1270. pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5)
  1271. pager.SetDefaultParams(ctx)
  1272. ctx.Data["Page"] = pager
  1273. ctx.Data["jobID"] = jobID
  1274. ctx.Data["jobName"] = VersionListTasks[0].JobName
  1275. ctx.Data["version_list_task"] = VersionListTasks
  1276. ctx.Data["version_list_count"] = VersionListCount
  1277. ctx.Data["canDownload"] = cloudbrain.CanDeleteJob(ctx, &VersionListTasks[0].Cloudbrain)
  1278. ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1279. }
  1280. func TrainJobGetLog(ctx *context.Context) {
  1281. ctx.Data["PageIsTrainJob"] = true
  1282. var jobID = ctx.Params(":jobid")
  1283. var logFileName = ctx.Query("file_name")
  1284. var baseLine = ctx.Query("base_line")
  1285. var order = ctx.Query("order")
  1286. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1287. log.Error("order(%s) check failed", order)
  1288. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1289. return
  1290. }
  1291. task, err := models.GetCloudbrainByJobID(jobID)
  1292. if err != nil {
  1293. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1294. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1295. return
  1296. }
  1297. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1298. if err != nil {
  1299. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1300. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1301. return
  1302. }
  1303. ctx.Data["log"] = result
  1304. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1305. }
  1306. func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
  1307. task, err := models.GetCloudbrainByJobID(jobID)
  1308. if err != nil {
  1309. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1310. return nil, nil, err
  1311. }
  1312. resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
  1313. if err != nil {
  1314. log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
  1315. return nil, nil, err
  1316. }
  1317. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines)
  1318. if err != nil {
  1319. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1320. return nil, nil, err
  1321. }
  1322. return resultLogFile, result, err
  1323. }
  1324. func TrainJobDel(ctx *context.Context) {
  1325. var jobID = ctx.Params(":jobid")
  1326. repo := ctx.Repo.Repository
  1327. var jobTypes []string
  1328. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  1329. VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1330. RepoID: repo.ID,
  1331. Type: models.TypeCloudBrainTwo,
  1332. JobTypes: jobTypes,
  1333. JobID: jobID,
  1334. })
  1335. if err != nil {
  1336. ctx.ServerError("get VersionListTasks failed", err)
  1337. return
  1338. }
  1339. //删除modelarts上的任务记录
  1340. _, err = modelarts.DelTrainJob(jobID)
  1341. if err != nil {
  1342. log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error())
  1343. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1344. return
  1345. }
  1346. //删除数据库Cloudbrain表的记录
  1347. for _, task := range VersionListTasks {
  1348. err = models.DeleteJob(&task.Cloudbrain)
  1349. if err != nil {
  1350. ctx.ServerError("DeleteJob failed", err)
  1351. return
  1352. }
  1353. }
  1354. //删除存储
  1355. if len(VersionListTasks) > 0 {
  1356. DeleteJobStorage(VersionListTasks[0].JobName)
  1357. }
  1358. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1359. }
  1360. func TrainJobStop(ctx *context.Context) {
  1361. var jobID = ctx.Params(":jobid")
  1362. task := ctx.Cloudbrain
  1363. _, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  1364. if err != nil {
  1365. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  1366. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1367. return
  1368. }
  1369. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1370. }
  1371. func canUserCreateTrainJob(uid int64) (bool, error) {
  1372. org, err := models.GetOrgByName(setting.AllowedOrg)
  1373. if err != nil {
  1374. log.Error("get allowed org failed: ", setting.AllowedOrg)
  1375. return false, err
  1376. }
  1377. return org.IsOrgMember(uid)
  1378. }
  1379. func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) {
  1380. if ctx == nil || ctx.User == nil {
  1381. log.Error("user unlogin!")
  1382. return false, nil
  1383. }
  1384. if userID == ctx.User.ID || ctx.User.IsAdmin {
  1385. return true, nil
  1386. } else {
  1387. log.Error("Only user itself and admin can new trainjob!")
  1388. return false, nil
  1389. }
  1390. }
  1391. func TrainJobGetConfigList(ctx *context.Context) {
  1392. ctx.Data["PageIsTrainJob"] = true
  1393. var jobID = ctx.Params(":jobid")
  1394. var logFileName = ctx.Query("file_name")
  1395. var baseLine = ctx.Query("base_line")
  1396. var order = ctx.Query("order")
  1397. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1398. log.Error("order(%s) check failed", order)
  1399. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1400. return
  1401. }
  1402. task, err := models.GetCloudbrainByJobID(jobID)
  1403. if err != nil {
  1404. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1405. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1406. return
  1407. }
  1408. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1409. if err != nil {
  1410. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1411. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1412. return
  1413. }
  1414. ctx.Data["log"] = result
  1415. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1416. }
  1417. func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
  1418. var result models.GetConfigListResult
  1419. list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
  1420. if err != nil {
  1421. log.Error("GetConfigList failed:", err)
  1422. return &result, err
  1423. }
  1424. for _, config := range list.ParaConfigs {
  1425. paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
  1426. if err != nil {
  1427. log.Error("GetParaConfig failed:", err)
  1428. return &result, err
  1429. }
  1430. config.Result = paraConfig
  1431. }
  1432. return list, nil
  1433. }
  1434. func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) {
  1435. ctx.Data["PageIsTrainJob"] = true
  1436. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  1437. jobName := form.JobName
  1438. uuid := form.Attachment
  1439. description := form.Description
  1440. workServerNumber := form.WorkServerNumber
  1441. engineID := form.EngineID
  1442. bootFile := form.BootFile
  1443. flavorCode := form.Flavor
  1444. params := form.Params
  1445. poolID := form.PoolID
  1446. repo := ctx.Repo.Repository
  1447. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  1448. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  1449. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  1450. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  1451. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  1452. branch_name := form.BranchName
  1453. FlavorName := form.FlavorName
  1454. EngineName := form.EngineName
  1455. LabelName := form.LabelName
  1456. isLatestVersion := modelarts.IsLatestVersion
  1457. VersionCount := modelarts.VersionCount
  1458. trainUrl := form.TrainUrl
  1459. modelName := form.ModelName
  1460. modelVersion := form.ModelVersion
  1461. ckptName := form.CkptName
  1462. ckptUrl := form.TrainUrl + form.CkptName
  1463. if err := paramCheckCreateInferenceJob(form); err != nil {
  1464. log.Error("paramCheckCreateInferenceJob failed:(%v)", err)
  1465. inferenceJobErrorNewDataPrepare(ctx, form)
  1466. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  1467. return
  1468. }
  1469. count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID)
  1470. if err != nil {
  1471. log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1472. inferenceJobErrorNewDataPrepare(ctx, form)
  1473. ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
  1474. return
  1475. } else {
  1476. if count >= 1 {
  1477. log.Error("the user already has running or waiting inference task", ctx.Data["MsgID"])
  1478. inferenceJobErrorNewDataPrepare(ctx, form)
  1479. ctx.RenderWithErr("you have already a running or waiting inference task, can not create more", tplModelArtsInferenceJobNew, &form)
  1480. return
  1481. }
  1482. }
  1483. //todo: del the codeLocalPath
  1484. _, err = ioutil.ReadDir(codeLocalPath)
  1485. if err == nil {
  1486. os.RemoveAll(codeLocalPath)
  1487. }
  1488. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  1489. commitID, _ := gitRepo.GetBranchCommitID(branch_name)
  1490. if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
  1491. Branch: branch_name,
  1492. }); err != nil {
  1493. log.Error("创建任务失败,服务器超时!: %s (%v)", repo.FullName(), err)
  1494. inferenceJobErrorNewDataPrepare(ctx, form)
  1495. ctx.RenderWithErr("创建任务失败,服务器超时!", tplModelArtsInferenceJobNew, &form)
  1496. return
  1497. }
  1498. //todo: upload code (send to file_server todo this work?)
  1499. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  1500. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  1501. inferenceJobErrorNewDataPrepare(ctx, form)
  1502. ctx.RenderWithErr("Failed to obsMkdir_result", tplModelArtsInferenceJobNew, &form)
  1503. return
  1504. }
  1505. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  1506. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  1507. inferenceJobErrorNewDataPrepare(ctx, form)
  1508. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsInferenceJobNew, &form)
  1509. return
  1510. }
  1511. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  1512. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  1513. inferenceJobErrorNewDataPrepare(ctx, form)
  1514. ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsInferenceJobNew, &form)
  1515. return
  1516. }
  1517. //todo: del local code?
  1518. var parameters models.Parameters
  1519. param := make([]models.Parameter, 0)
  1520. param = append(param, models.Parameter{
  1521. Label: modelarts.ResultUrl,
  1522. Value: "s3:/" + resultObsPath,
  1523. }, models.Parameter{
  1524. Label: modelarts.CkptUrl,
  1525. Value: "s3:/" + ckptUrl,
  1526. })
  1527. if len(params) != 0 {
  1528. err := json.Unmarshal([]byte(params), &parameters)
  1529. if err != nil {
  1530. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1531. inferenceJobErrorNewDataPrepare(ctx, form)
  1532. ctx.RenderWithErr("运行参数错误", tplModelArtsInferenceJobNew, &form)
  1533. return
  1534. }
  1535. for _, parameter := range parameters.Parameter {
  1536. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  1537. param = append(param, models.Parameter{
  1538. Label: parameter.Label,
  1539. Value: parameter.Value,
  1540. })
  1541. }
  1542. }
  1543. }
  1544. req := &modelarts.GenerateInferenceJobReq{
  1545. JobName: jobName,
  1546. DataUrl: dataPath,
  1547. Description: description,
  1548. CodeObsPath: codeObsPath,
  1549. BootFileUrl: codeObsPath + bootFile,
  1550. BootFile: bootFile,
  1551. TrainUrl: trainUrl,
  1552. FlavorCode: flavorCode,
  1553. WorkServerNumber: workServerNumber,
  1554. EngineID: int64(engineID),
  1555. LogUrl: logObsPath,
  1556. PoolID: poolID,
  1557. Uuid: uuid,
  1558. Parameters: param, //modelarts训练时用到
  1559. CommitID: commitID,
  1560. BranchName: branch_name,
  1561. Params: form.Params,
  1562. FlavorName: FlavorName,
  1563. EngineName: EngineName,
  1564. LabelName: LabelName,
  1565. IsLatestVersion: isLatestVersion,
  1566. VersionCount: VersionCount,
  1567. TotalVersionCount: modelarts.TotalVersionCount,
  1568. ModelName: modelName,
  1569. ModelVersion: modelVersion,
  1570. CkptName: ckptName,
  1571. ResultUrl: resultObsPath,
  1572. }
  1573. //将params转换Parameters.Parameter,出错时返回给前端
  1574. // var Parameters modelarts.Parameters
  1575. // if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
  1576. // ctx.ServerError("json.Unmarshal failed:", err)
  1577. // return
  1578. // }
  1579. err = modelarts.GenerateInferenceJob(ctx, req)
  1580. if err != nil {
  1581. log.Error("GenerateTrainJob failed:%v", err.Error())
  1582. inferenceJobErrorNewDataPrepare(ctx, form)
  1583. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  1584. return
  1585. }
  1586. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")
  1587. }
  1588. func InferenceJobIndex(ctx *context.Context) {
  1589. MustEnableModelArts(ctx)
  1590. repo := ctx.Repo.Repository
  1591. page := ctx.QueryInt("page")
  1592. if page <= 0 {
  1593. page = 1
  1594. }
  1595. var jobTypes []string
  1596. jobTypes = append(jobTypes, string(models.JobTypeInference))
  1597. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  1598. ListOptions: models.ListOptions{
  1599. Page: page,
  1600. PageSize: setting.UI.IssuePagingNum,
  1601. },
  1602. RepoID: repo.ID,
  1603. Type: models.TypeCloudBrainTwo,
  1604. JobTypes: jobTypes,
  1605. })
  1606. if err != nil {
  1607. ctx.ServerError("Cloudbrain", err)
  1608. return
  1609. }
  1610. for i, task := range tasks {
  1611. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  1612. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  1613. tasks[i].ComputeResource = models.NPUResource
  1614. }
  1615. repoId := ctx.Repo.Repository.ID
  1616. Type := -1
  1617. _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
  1618. ListOptions: models.ListOptions{
  1619. Page: 1,
  1620. PageSize: 2,
  1621. },
  1622. RepoID: repoId,
  1623. Type: Type,
  1624. New: MODEL_LATEST,
  1625. })
  1626. ctx.Data["MODEL_COUNT"] = model_count
  1627. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  1628. pager.SetDefaultParams(ctx)
  1629. ctx.Data["Page"] = pager
  1630. ctx.Data["PageIsCloudBrain"] = true
  1631. ctx.Data["Tasks"] = tasks
  1632. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  1633. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  1634. ctx.HTML(200, tplModelArtsInferenceJobIndex)
  1635. }
  1636. func InferenceJobNew(ctx *context.Context) {
  1637. err := inferenceJobNewDataPrepare(ctx)
  1638. if err != nil {
  1639. ctx.ServerError("get new inference-job info failed", err)
  1640. return
  1641. }
  1642. ctx.HTML(200, tplModelArtsInferenceJobNew)
  1643. }
  1644. func inferenceJobNewDataPrepare(ctx *context.Context) error {
  1645. ctx.Data["PageIsCloudBrain"] = true
  1646. t := time.Now()
  1647. var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  1648. ctx.Data["job_name"] = jobName
  1649. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  1650. if err != nil {
  1651. ctx.ServerError("GetAllUserAttachments failed:", err)
  1652. return err
  1653. }
  1654. ctx.Data["attachments"] = attachs
  1655. var resourcePools modelarts.ResourcePool
  1656. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  1657. ctx.ServerError("json.Unmarshal failed:", err)
  1658. return err
  1659. }
  1660. ctx.Data["resource_pools"] = resourcePools.Info
  1661. var engines modelarts.Engine
  1662. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  1663. ctx.ServerError("json.Unmarshal failed:", err)
  1664. return err
  1665. }
  1666. ctx.Data["engines"] = engines.Info
  1667. var versionInfos modelarts.VersionInfo
  1668. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  1669. ctx.ServerError("json.Unmarshal failed:", err)
  1670. return err
  1671. }
  1672. ctx.Data["engine_versions"] = versionInfos.Version
  1673. var flavorInfos modelarts.Flavor
  1674. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  1675. ctx.ServerError("json.Unmarshal failed:", err)
  1676. return err
  1677. }
  1678. ctx.Data["flavor_infos"] = flavorInfos.Info
  1679. ctx.Data["params"] = ""
  1680. ctx.Data["branchName"] = ctx.Repo.BranchName
  1681. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  1682. if err != nil {
  1683. ctx.ServerError("getConfigList failed:", err)
  1684. return err
  1685. }
  1686. ctx.Data["config_list"] = configList.ParaConfigs
  1687. repoId := ctx.Repo.Repository.ID
  1688. Type := -1
  1689. _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
  1690. ListOptions: models.ListOptions{
  1691. Page: 1,
  1692. PageSize: 2,
  1693. },
  1694. RepoID: repoId,
  1695. Type: Type,
  1696. New: MODEL_LATEST,
  1697. })
  1698. ctx.Data["MODEL_COUNT"] = model_count
  1699. return nil
  1700. }
  1701. func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error {
  1702. ctx.Data["PageIsCloudBrain"] = true
  1703. t := time.Now()
  1704. var jobName = "inference" + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  1705. ctx.Data["job_name"] = jobName
  1706. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  1707. if err != nil {
  1708. ctx.ServerError("GetAllUserAttachments failed:", err)
  1709. return err
  1710. }
  1711. ctx.Data["attachments"] = attachs
  1712. var resourcePools modelarts.ResourcePool
  1713. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  1714. ctx.ServerError("json.Unmarshal failed:", err)
  1715. return err
  1716. }
  1717. ctx.Data["resource_pools"] = resourcePools.Info
  1718. var engines modelarts.Engine
  1719. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  1720. ctx.ServerError("json.Unmarshal failed:", err)
  1721. return err
  1722. }
  1723. ctx.Data["engines"] = engines.Info
  1724. var versionInfos modelarts.VersionInfo
  1725. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  1726. ctx.ServerError("json.Unmarshal failed:", err)
  1727. return err
  1728. }
  1729. ctx.Data["engine_versions"] = versionInfos.Version
  1730. var flavorInfos modelarts.Flavor
  1731. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  1732. ctx.ServerError("json.Unmarshal failed:", err)
  1733. return err
  1734. }
  1735. ctx.Data["flavor_infos"] = flavorInfos.Info
  1736. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  1737. if err != nil {
  1738. ctx.ServerError("getConfigList failed:", err)
  1739. return err
  1740. }
  1741. var Parameters modelarts.Parameters
  1742. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  1743. ctx.ServerError("json.Unmarshal failed:", err)
  1744. return err
  1745. }
  1746. ctx.Data["params"] = Parameters.Parameter
  1747. ctx.Data["config_list"] = configList.ParaConfigs
  1748. ctx.Data["bootFile"] = form.BootFile
  1749. ctx.Data["uuid"] = form.Attachment
  1750. ctx.Data["branch_name"] = form.BranchName
  1751. ctx.Data["model_name"] = form.ModelName
  1752. ctx.Data["model_version"] = form.ModelVersion
  1753. ctx.Data["ckpt_name"] = form.CkptName
  1754. ctx.Data["train_url"] = form.TrainUrl
  1755. return nil
  1756. }
  1757. func InferenceJobShow(ctx *context.Context) {
  1758. ctx.Data["PageIsCloudBrain"] = true
  1759. var jobID = ctx.Params(":jobid")
  1760. page := ctx.QueryInt("page")
  1761. if page <= 0 {
  1762. page = 1
  1763. }
  1764. task, err := models.GetCloudbrainByJobID(jobID)
  1765. if err != nil {
  1766. log.Error("GetInferenceTask(%s) failed:%v", jobID, err.Error())
  1767. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobShow, nil)
  1768. return
  1769. }
  1770. //设置权限
  1771. canNewJob, err := canUserCreateTrainJobVersion(ctx, task.UserID)
  1772. if err != nil {
  1773. ctx.ServerError("canNewJob failed", err)
  1774. return
  1775. }
  1776. ctx.Data["canNewJob"] = canNewJob
  1777. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  1778. var parameters models.Parameters
  1779. err = json.Unmarshal([]byte(task.Parameters), &parameters)
  1780. if err != nil {
  1781. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1782. trainJobNewDataPrepare(ctx)
  1783. return
  1784. }
  1785. if len(parameters.Parameter) > 0 {
  1786. paramTemp := ""
  1787. for _, Parameter := range parameters.Parameter {
  1788. param := Parameter.Label + " = " + Parameter.Value + "; "
  1789. paramTemp = paramTemp + param
  1790. }
  1791. task.Parameters = paramTemp[:len(paramTemp)-2]
  1792. } else {
  1793. task.Parameters = ""
  1794. }
  1795. LabelName := strings.Fields(task.LabelName)
  1796. ctx.Data["labelName"] = LabelName
  1797. ctx.Data["jobID"] = jobID
  1798. ctx.Data["jobName"] = task.JobName
  1799. ctx.Data["task"] = task
  1800. ctx.Data["canDownload"] = cloudbrain.CanDeleteJob(ctx, task)
  1801. tempUids := []int64{}
  1802. tempUids = append(tempUids, task.UserID)
  1803. JobCreater, err := models.GetUserNamesByIDs(tempUids)
  1804. if err != nil {
  1805. log.Error("GetUserNamesByIDs (WhitelistUserIDs): %v", err)
  1806. }
  1807. ctx.Data["userName"] = JobCreater[0]
  1808. ctx.HTML(http.StatusOK, tplModelArtsInferenceJobShow)
  1809. }
  1810. func ModelDownload(ctx *context.Context) {
  1811. var (
  1812. err error
  1813. )
  1814. var jobID = ctx.Params(":jobid")
  1815. versionName := ctx.Query("version_name")
  1816. parentDir := ctx.Query("parent_dir")
  1817. fileName := ctx.Query("file_name")
  1818. log.Info("DownloadSingleModelFile start.")
  1819. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  1820. if err != nil {
  1821. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  1822. return
  1823. }
  1824. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir, fileName), "/")
  1825. log.Info("Download path is:%s", path)
  1826. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  1827. if err != nil {
  1828. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  1829. ctx.ServerError("GetObsCreateSignedUrl", err)
  1830. return
  1831. }
  1832. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  1833. }
  1834. func ResultDownload(ctx *context.Context) {
  1835. var (
  1836. err error
  1837. )
  1838. var jobID = ctx.Params(":jobid")
  1839. versionName := ctx.Query("version_name")
  1840. parentDir := ctx.Query("parent_dir")
  1841. fileName := ctx.Query("file_name")
  1842. log.Info("DownloadResult start.")
  1843. task, err := models.GetCloudbrainByJobID(jobID)
  1844. if err != nil {
  1845. ctx.Data["error"] = err.Error()
  1846. }
  1847. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName, parentDir, fileName), "/")
  1848. log.Info("Download path is:%s", path)
  1849. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  1850. if err != nil {
  1851. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  1852. ctx.ServerError("GetObsCreateSignedUrl", err)
  1853. return
  1854. }
  1855. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  1856. }
  1857. func DeleteJobStorage(jobName string) error {
  1858. //delete local
  1859. localJobPath := setting.JobPath + jobName
  1860. err := os.RemoveAll(localJobPath)
  1861. if err != nil {
  1862. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  1863. }
  1864. //delete oss
  1865. dirPath := setting.CodePathPrefix + jobName + "/"
  1866. err = storage.ObsRemoveObject(setting.Bucket, dirPath)
  1867. if err != nil {
  1868. log.Error("ObsRemoveObject(%s) failed:%v", localJobPath, err)
  1869. }
  1870. return nil
  1871. }
  1872. func DownloadMultiResultFile(ctx *context.Context) {
  1873. var jobID = ctx.Params(":jobid")
  1874. var versionName = ctx.Query("version_name")
  1875. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  1876. if err != nil {
  1877. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  1878. return
  1879. }
  1880. // if !isCanDeleteOrDownload(ctx, task) {
  1881. // ctx.ServerError("no right.", errors.New(ctx.Tr("repo.model_noright")))
  1882. // return
  1883. // }
  1884. // path := Model_prefix + models.AttachmentRelativePath(id) + "/"
  1885. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName), "/") + "/"
  1886. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
  1887. if err == nil {
  1888. //count++
  1889. // models.ModifyModelDownloadCount(id)
  1890. returnFileName := task.JobName + ".zip"
  1891. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+returnFileName)
  1892. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  1893. w := zip.NewWriter(ctx.Resp)
  1894. defer w.Close()
  1895. for _, oneFile := range allFile {
  1896. if oneFile.IsDir {
  1897. log.Info("zip dir name:" + oneFile.FileName)
  1898. } else {
  1899. log.Info("zip file name:" + oneFile.FileName)
  1900. fDest, err := w.Create(oneFile.FileName)
  1901. if err != nil {
  1902. log.Info("create zip entry error, download file failed: %s\n", err.Error())
  1903. ctx.ServerError("download file failed:", err)
  1904. return
  1905. }
  1906. body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName)
  1907. if err != nil {
  1908. log.Info("download file failed: %s\n", err.Error())
  1909. ctx.ServerError("download file failed:", err)
  1910. return
  1911. } else {
  1912. defer body.Close()
  1913. p := make([]byte, 1024)
  1914. var readErr error
  1915. var readCount int
  1916. // 读取对象内容
  1917. for {
  1918. readCount, readErr = body.Read(p)
  1919. if readCount > 0 {
  1920. fDest.Write(p[:readCount])
  1921. }
  1922. if readErr != nil {
  1923. break
  1924. }
  1925. }
  1926. }
  1927. }
  1928. }
  1929. } else {
  1930. log.Info("error,msg=" + err.Error())
  1931. ctx.ServerError("no file to download.", err)
  1932. }
  1933. }
  1934. func SetJobCount(ctx *context.Context) {
  1935. repoId := ctx.Repo.Repository.ID
  1936. _, jobCount, err := models.Cloudbrains(&models.CloudbrainsOptions{
  1937. RepoID: repoId,
  1938. Type: modelarts.DebugType,
  1939. })
  1940. if err != nil {
  1941. ctx.ServerError("Get job faild:", err)
  1942. return
  1943. }
  1944. ctx.Data["jobCount"] = jobCount
  1945. }