You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

modelarts.go 97 kB

4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package repo
  2. import (
  3. "archive/zip"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "path"
  12. "strconv"
  13. "strings"
  14. "time"
  15. "unicode/utf8"
  16. "code.gitea.io/gitea/modules/modelarts_cd"
  17. "code.gitea.io/gitea/services/cloudbrain/resource"
  18. "code.gitea.io/gitea/services/reward/point/account"
  19. "code.gitea.io/gitea/models"
  20. "code.gitea.io/gitea/modules/auth"
  21. "code.gitea.io/gitea/modules/base"
  22. "code.gitea.io/gitea/modules/cloudbrain"
  23. "code.gitea.io/gitea/modules/context"
  24. "code.gitea.io/gitea/modules/git"
  25. "code.gitea.io/gitea/modules/log"
  26. "code.gitea.io/gitea/modules/modelarts"
  27. "code.gitea.io/gitea/modules/notification"
  28. "code.gitea.io/gitea/modules/obs"
  29. "code.gitea.io/gitea/modules/redis/redis_key"
  30. "code.gitea.io/gitea/modules/redis/redis_lock"
  31. "code.gitea.io/gitea/modules/setting"
  32. "code.gitea.io/gitea/modules/storage"
  33. "code.gitea.io/gitea/modules/timeutil"
  34. "code.gitea.io/gitea/modules/util"
  35. )
// Template names rendered by the debug-job, notebook, train-job and
// inference-job handlers of this package.
const (
	tplDebugJobIndex               base.TplName = "repo/debugjob/index"
	tplModelArtsNotebookIndex      base.TplName = "repo/modelarts/notebook/index"
	tplModelArtsNotebookNew        base.TplName = "repo/modelarts/notebook/new"
	tplModelArtsNotebookShow       base.TplName = "repo/modelarts/notebook/show"
	tplModelArtsTrainJobIndex      base.TplName = "repo/modelarts/trainjob/index"
	tplModelArtsTrainJobNew        base.TplName = "repo/modelarts/trainjob/new"
	tplModelArtsTrainJobShow       base.TplName = "repo/modelarts/trainjob/show"
	tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
	tplModelArtsInferenceJobIndex  base.TplName = "repo/modelarts/inferencejob/index"
	tplModelArtsInferenceJobNew    base.TplName = "repo/modelarts/inferencejob/new"
	tplModelArtsInferenceJobShow   base.TplName = "repo/modelarts/inferencejob/show"
)
  49. func DebugJobIndex(ctx *context.Context) {
  50. listType := ctx.Query("debugListType")
  51. if listType == "" {
  52. listType = models.AllResource
  53. }
  54. ctx.Data["ListType"] = listType
  55. MustEnableCloudbrain(ctx)
  56. repo := ctx.Repo.Repository
  57. page := ctx.QueryInt("page")
  58. if page <= 0 {
  59. page = 1
  60. }
  61. jobTypeNot := false
  62. var computeResource string
  63. if listType != models.AllResource {
  64. computeResource = listType
  65. }
  66. var jobTypes []string
  67. jobTypes = append(jobTypes, string(models.JobTypeDebug))
  68. ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  69. ListOptions: models.ListOptions{
  70. Page: page,
  71. PageSize: setting.UI.IssuePagingNum,
  72. },
  73. RepoID: repo.ID,
  74. ComputeResource: computeResource,
  75. Type: models.TypeCloudBrainAll,
  76. JobTypeNot: jobTypeNot,
  77. JobTypes: jobTypes,
  78. })
  79. if err != nil {
  80. ctx.ServerError("Get debugjob faild:", err)
  81. return
  82. }
  83. for i, task := range ciTasks {
  84. ciTasks[i].CanDebug = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  85. ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  86. ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource
  87. }
  88. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  89. pager.AddParam(ctx, "debugListType", "ListType")
  90. ctx.Data["Page"] = pager
  91. ctx.Data["PageIsCloudBrain"] = true
  92. ctx.Data["Tasks"] = ciTasks
  93. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  94. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  95. ctx.Data["debugListType"] = listType
  96. ctx.HTML(200, tplDebugJobIndex)
  97. }
  98. // MustEnableDataset check if repository enable internal cb
  99. func MustEnableModelArts(ctx *context.Context) {
  100. if !ctx.Repo.CanRead(models.UnitTypeCloudBrain) {
  101. ctx.NotFound("MustEnableCloudbrain", nil)
  102. return
  103. }
  104. }
  105. func NotebookNew(ctx *context.Context) {
  106. notebookNewDataPrepare(ctx)
  107. ctx.HTML(200, tplModelArtsNotebookNew)
  108. }
  109. func notebookNewDataPrepare(ctx *context.Context) error {
  110. ctx.Data["PageIsCloudBrain"] = true
  111. t := time.Now()
  112. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  113. ctx.Data["display_job_name"] = displayJobName
  114. attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
  115. if err != nil {
  116. ctx.ServerError("GetAllUserAttachments failed:", err)
  117. return err
  118. }
  119. ctx.Data["attachments"] = attachs
  120. ctx.Data["images"] = setting.StImageInfos.ImageInfo
  121. prepareCloudbrainTwoDebugSpecs(ctx)
  122. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  123. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  124. ctx.Data["WaitCount"] = waitCount
  125. return nil
  126. }
  127. func prepareCloudbrainTwoDebugSpecs(ctx *context.Context) {
  128. aiCenterCode := models.AICenterOfCloudBrainTwo
  129. if setting.ModelartsCD.Enabled {
  130. aiCenterCode = models.AICenterOfChengdu
  131. }
  132. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  133. JobType: models.JobTypeDebug,
  134. ComputeResource: models.NPU,
  135. Cluster: models.OpenICluster,
  136. AiCenterCode: aiCenterCode,
  137. })
  138. ctx.Data["Specs"] = noteBookSpecs
  139. }
  140. func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  141. ctx.Data["PageIsNotebook"] = true
  142. jobName := form.JobName
  143. uuid := form.Attachment
  144. description := form.Description
  145. flavor := form.Flavor
  146. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  147. if err != nil {
  148. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  149. cloudBrainNewDataPrepare(ctx)
  150. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  151. return
  152. } else {
  153. if count >= 1 {
  154. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  155. cloudBrainNewDataPrepare(ctx)
  156. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  157. return
  158. }
  159. }
  160. _, err = models.GetCloudbrainByName(jobName)
  161. if err == nil {
  162. log.Error("the job name did already exist", ctx.Data["MsgID"])
  163. cloudBrainNewDataPrepare(ctx)
  164. ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form)
  165. return
  166. } else {
  167. if !models.IsErrJobNotExist(err) {
  168. log.Error("system error, %v", err, ctx.Data["MsgID"])
  169. cloudBrainNewDataPrepare(ctx)
  170. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  171. return
  172. }
  173. }
  174. err = modelarts.GenerateTask(ctx, jobName, uuid, description, flavor)
  175. if err != nil {
  176. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  177. return
  178. }
  179. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  180. }
  181. func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
  182. ctx.Data["PageIsNotebook"] = true
  183. displayJobName := form.DisplayJobName
  184. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  185. uuid := form.Attachment
  186. description := form.Description
  187. imageId := form.ImageId
  188. repo := ctx.Repo.Repository
  189. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeDebug), displayJobName))
  190. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  191. if !isOk {
  192. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  193. notebookNewDataPrepare(ctx)
  194. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsNotebookNew, &form)
  195. return
  196. }
  197. defer lock.UnLock()
  198. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  199. if err != nil {
  200. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  201. notebookNewDataPrepare(ctx)
  202. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  203. return
  204. } else {
  205. if count >= 1 {
  206. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  207. notebookNewDataPrepare(ctx)
  208. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsNotebookNew, &form)
  209. return
  210. }
  211. }
  212. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
  213. if err == nil {
  214. if len(tasks) != 0 {
  215. log.Error("the job name did already exist", ctx.Data["MsgID"])
  216. notebookNewDataPrepare(ctx)
  217. ctx.RenderWithErr("the job name did already exist", tplModelArtsNotebookNew, &form)
  218. return
  219. }
  220. } else {
  221. if !models.IsErrJobNotExist(err) {
  222. log.Error("system error, %v", err, ctx.Data["MsgID"])
  223. notebookNewDataPrepare(ctx)
  224. ctx.RenderWithErr("system error", tplModelArtsNotebookNew, &form)
  225. return
  226. }
  227. }
  228. var aiCenterCode = models.AICenterOfCloudBrainTwo
  229. if setting.ModelartsCD.Enabled {
  230. aiCenterCode = models.AICenterOfChengdu
  231. }
  232. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  233. JobType: models.JobTypeDebug,
  234. ComputeResource: models.NPU,
  235. Cluster: models.OpenICluster,
  236. AiCenterCode: aiCenterCode})
  237. if err != nil || spec == nil {
  238. notebookNewDataPrepare(ctx)
  239. ctx.RenderWithErr("Resource specification not available", tplModelArtsNotebookNew, &form)
  240. return
  241. }
  242. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  243. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  244. cloudBrainNewDataPrepare(ctx)
  245. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsNotebookNew, &form)
  246. return
  247. }
  248. if setting.ModelartsCD.Enabled {
  249. err = modelarts_cd.GenerateNotebook(ctx, displayJobName, jobName, uuid, description, imageId, spec)
  250. } else {
  251. err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, imageId, spec)
  252. }
  253. if err != nil {
  254. log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"])
  255. notebookNewDataPrepare(ctx)
  256. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
  257. return
  258. }
  259. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  260. }
  261. func NotebookShow(ctx *context.Context) {
  262. ctx.Data["PageIsCloudBrain"] = true
  263. debugListType := ctx.Query("debugListType")
  264. if debugListType == "" {
  265. debugListType = "all"
  266. }
  267. var ID = ctx.Params(":id")
  268. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  269. if err != nil {
  270. log.Error("GET job error", err.Error())
  271. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  272. return
  273. }
  274. if task.DeletedAt.IsZero() { //normal record
  275. err := modelarts.HandleNotebookInfo(task)
  276. if err != nil {
  277. ctx.Data["error"] = err.Error()
  278. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
  279. return
  280. }
  281. } else { //deleted record
  282. }
  283. datasetDownload := make([]models.DatasetDownload, 0)
  284. if ctx.IsSigned {
  285. if task.Uuid != "" && task.UserID == ctx.User.ID {
  286. datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, true)
  287. }
  288. }
  289. user, err := models.GetUserByID(task.UserID)
  290. if err == nil {
  291. task.User = user
  292. }
  293. prepareSpec4Show(ctx, task)
  294. if task.TrainJobDuration == "" {
  295. if task.Duration == 0 {
  296. var duration int64
  297. if task.Status == string(models.JobRunning) {
  298. duration = time.Now().Unix() - int64(task.CreatedUnix)
  299. } else {
  300. duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
  301. }
  302. task.Duration = duration
  303. }
  304. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  305. }
  306. ctx.Data["duration"] = task.TrainJobDuration
  307. ctx.Data["datasetDownload"] = datasetDownload
  308. ctx.Data["task"] = task
  309. ctx.Data["ID"] = ID
  310. ctx.Data["jobName"] = task.JobName
  311. ctx.Data["debugListType"] = debugListType
  312. ctx.HTML(200, tplModelArtsNotebookShow)
  313. }
  314. func GetCloudBrainDataSetInfo(uuid string, datasetname string, isNeedDown bool) []models.DatasetDownload {
  315. datasetDownload := make([]models.DatasetDownload, 0)
  316. if len(uuid) == 0 {
  317. return datasetDownload
  318. }
  319. uuidList := strings.Split(uuid, ";")
  320. datasetnameList := strings.Split(datasetname, ";")
  321. for i, uuidStr := range uuidList {
  322. name := ""
  323. link := ""
  324. url := ""
  325. isDelete := false
  326. attachment, err := models.GetAttachmentByUUID(uuidStr)
  327. if err != nil {
  328. log.Error("GetAttachmentByUUID failed:%v", err.Error())
  329. if len(datasetnameList) <= i || len(datasetname) == 0 {
  330. continue
  331. }
  332. name = datasetnameList[i]
  333. isDelete = true
  334. } else {
  335. name = attachment.Name
  336. dataset, err := models.GetDatasetByID(attachment.DatasetID)
  337. if err != nil {
  338. log.Error("GetDatasetByID failed:%v", err.Error())
  339. } else {
  340. repo, err := models.GetRepositoryByID(dataset.RepoID)
  341. if err != nil {
  342. log.Error("GetRepositoryByID failed:%v", err.Error())
  343. } else {
  344. link = repo.Link() + "/datasets"
  345. }
  346. }
  347. if isNeedDown {
  348. url = attachment.S3DownloadURL()
  349. }
  350. }
  351. datasetDownload = append(datasetDownload, models.DatasetDownload{
  352. DatasetName: name,
  353. DatasetDownloadLink: url,
  354. RepositoryLink: link,
  355. IsDelete: isDelete,
  356. })
  357. }
  358. log.Info("dataset length=" + fmt.Sprint(len(datasetDownload)))
  359. return datasetDownload
  360. }
  361. func setShowSpecBySpecialPoolConfig(ctx *context.Context, findSpec bool, task *models.Cloudbrain) {
  362. modelarts.InitSpecialPool()
  363. if modelarts.SpecialPools != nil && !findSpec {
  364. for _, pool := range modelarts.SpecialPools.Pools {
  365. for _, flavor := range pool.Flavor {
  366. if flavor.Value == task.FlavorCode {
  367. ctx.Data["resource_spec"] = flavor.Desc
  368. }
  369. }
  370. }
  371. }
  372. }
  373. func NotebookDebug2(ctx *context.Context) {
  374. var err error
  375. var result *models.GetNotebook2Result
  376. task := ctx.Cloudbrain
  377. if task.Type == models.TypeCloudBrainTwo {
  378. result, err = modelarts.GetNotebook2(task.JobID)
  379. } else if task.Type == models.TypeCDCenter {
  380. result, err = modelarts_cd.GetNotebook(task.JobID)
  381. }
  382. if err != nil {
  383. ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
  384. return
  385. }
  386. ctx.Redirect(result.Url + "?token=" + result.Token)
  387. }
  388. func NotebookRestart(ctx *context.Context) {
  389. var id = ctx.Params(":id")
  390. var resultCode = "-1"
  391. var errorMsg = ""
  392. var status = ""
  393. var spec *models.Specification
  394. task := ctx.Cloudbrain
  395. for {
  396. ctx.CheckWechatBind()
  397. if ctx.Written() {
  398. return
  399. }
  400. if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
  401. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  402. errorMsg = "the job is not stopped"
  403. break
  404. }
  405. count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
  406. if err != nil {
  407. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  408. errorMsg = "system error"
  409. break
  410. } else {
  411. if count >= 1 {
  412. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  413. errorMsg = "you have already a running or waiting task, can not create more"
  414. break
  415. }
  416. }
  417. oldSpec, err := resource.GetCloudbrainSpec(task.ID)
  418. if err != nil || oldSpec == nil {
  419. log.Error("NotebookManage GetCloudbrainSpec error.%v", err)
  420. errorMsg = "Resource specification not available"
  421. break
  422. }
  423. aiCenterCode := models.AICenterOfCloudBrainTwo
  424. if task.Type == models.TypeCDCenter {
  425. aiCenterCode = models.AICenterOfChengdu
  426. }
  427. spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{
  428. JobType: models.JobType(task.JobType),
  429. ComputeResource: models.NPU,
  430. Cluster: models.OpenICluster,
  431. AiCenterCode: aiCenterCode})
  432. if err != nil || spec == nil {
  433. log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID)
  434. errorMsg = "Resource specification not support any more"
  435. break
  436. }
  437. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  438. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  439. errorMsg = ctx.Tr("points.insufficient_points_balance")
  440. break
  441. }
  442. createTime := timeutil.TimeStampNow()
  443. param := models.NotebookAction{
  444. Action: models.ActionStart,
  445. }
  446. var res *models.NotebookActionResult
  447. if task.Type == models.TypeCloudBrainTwo {
  448. res, err = modelarts.ManageNotebook2(task.JobID, param)
  449. } else if task.Type == models.TypeCDCenter {
  450. res, err = modelarts_cd.ManageNotebook(task.JobID, param)
  451. }
  452. if err != nil {
  453. log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
  454. /* 暂不处理再次调试502的场景,详情见方案
  455. if strings.HasPrefix(err.Error(), modelarts.UnknownErrorPrefix) {
  456. log.Info("(%s)unknown error, set temp status", task.DisplayJobName)
  457. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  458. JobID: task.JobID,
  459. VersionID: models.TempVersionId,
  460. Status: models.TempJobStatus,
  461. Type: task.Type,
  462. JobName: task.JobName,
  463. JobType: task.JobType,
  464. })
  465. if errTemp != nil {
  466. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  467. }
  468. }
  469. */
  470. errorMsg = err.Error()
  471. break
  472. }
  473. newTask := &models.Cloudbrain{
  474. Status: res.Status,
  475. UserID: task.UserID,
  476. RepoID: task.RepoID,
  477. JobID: task.JobID,
  478. JobName: task.JobName,
  479. DisplayJobName: task.DisplayJobName,
  480. JobType: task.JobType,
  481. Type: task.Type,
  482. Uuid: task.Uuid,
  483. Image: task.Image,
  484. ComputeResource: task.ComputeResource,
  485. Description: task.Description,
  486. CreatedUnix: createTime,
  487. UpdatedUnix: createTime,
  488. Spec: spec,
  489. }
  490. err = models.RestartCloudbrain(task, newTask)
  491. if err != nil {
  492. log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  493. errorMsg = "system error"
  494. break
  495. }
  496. id = strconv.FormatInt(newTask.ID, 10)
  497. status = res.Status
  498. resultCode = "0"
  499. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, models.ActionCreateDebugNPUTask)
  500. break
  501. }
  502. ctx.JSON(200, map[string]string{
  503. "result_code": resultCode,
  504. "error_msg": errorMsg,
  505. "status": status,
  506. "id": id,
  507. })
  508. }
  509. func NotebookStop(ctx *context.Context) {
  510. var id = ctx.Params(":id")
  511. var resultCode = "0"
  512. var errorMsg = ""
  513. var status = ""
  514. task := ctx.Cloudbrain
  515. for {
  516. if task.Status != string(models.ModelArtsRunning) {
  517. log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
  518. resultCode = "-1"
  519. errorMsg = "the job is not running"
  520. break
  521. }
  522. param := models.NotebookAction{
  523. Action: models.ActionStop,
  524. }
  525. var err error
  526. var res *models.NotebookActionResult
  527. if task.Type == models.TypeCloudBrainTwo {
  528. res, err = modelarts.ManageNotebook2(task.JobID, param)
  529. } else if task.Type == models.TypeCDCenter {
  530. res, err = modelarts_cd.ManageNotebook(task.JobID, param)
  531. }
  532. if err != nil {
  533. log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  534. resultCode = "-1"
  535. errorMsg = err.Error()
  536. if strings.Contains(err.Error(), modelarts.NotebookNotFound) {
  537. errorMsg = "the job's version is too old and can not be restarted"
  538. }
  539. break
  540. }
  541. status = res.Status
  542. oldStatus := task.Status
  543. task.Status = res.Status
  544. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  545. task.EndTime = timeutil.TimeStampNow()
  546. }
  547. task.ComputeAndSetDuration()
  548. if oldStatus != task.Status {
  549. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  550. }
  551. err = models.UpdateJob(task)
  552. if err != nil {
  553. log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  554. resultCode = "-1"
  555. errorMsg = "system error"
  556. break
  557. }
  558. break
  559. }
  560. ctx.JSON(200, map[string]string{
  561. "result_code": resultCode,
  562. "error_msg": errorMsg,
  563. "status": status,
  564. "id": id,
  565. })
  566. }
  567. func NotebookDel(ctx *context.Context) {
  568. var listType = ctx.Query("debugListType")
  569. task := ctx.Cloudbrain
  570. if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsDeleted) {
  571. log.Error("the job(%s) has not been stopped", task.JobName)
  572. ctx.RenderWithErr("the job has not been stopped", tplDebugJobIndex, nil)
  573. return
  574. }
  575. var err error
  576. if task.Type == models.TypeCloudBrainTwo {
  577. _, err = modelarts.DelNotebook2(task.JobID)
  578. } else if task.Type == models.TypeCDCenter {
  579. _, err = modelarts_cd.DelNotebook(task.JobID)
  580. }
  581. if err != nil {
  582. log.Error("DelNotebook2(%s) failed:%v", task.JobName, err.Error())
  583. if strings.Contains(err.Error(), modelarts.NotebookNotFound) || strings.Contains(err.Error(), modelarts.NotebookNoPermission) || strings.Contains(err.Error(), modelarts.NotebookInvalid) {
  584. log.Info("old notebook version")
  585. } else {
  586. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  587. return
  588. }
  589. }
  590. err = models.DeleteJob(task)
  591. if err != nil {
  592. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  593. return
  594. }
  595. var isAdminPage = ctx.Query("isadminpage")
  596. var isHomePage = ctx.Query("ishomepage")
  597. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  598. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  599. } else if isHomePage == "true" {
  600. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  601. } else {
  602. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  603. }
  604. }
  605. func TrainJobIndex(ctx *context.Context) {
  606. MustEnableModelArts(ctx)
  607. repo := ctx.Repo.Repository
  608. page := ctx.QueryInt("page")
  609. if page <= 0 {
  610. page = 1
  611. }
  612. listType := ctx.Query("listType")
  613. ctx.Data["ListType"] = listType
  614. if listType == models.AllResource {
  615. listType = ""
  616. }
  617. var jobTypes []string
  618. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  619. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  620. ListOptions: models.ListOptions{
  621. Page: page,
  622. PageSize: setting.UI.IssuePagingNum,
  623. },
  624. RepoID: repo.ID,
  625. JobTypeNot: false,
  626. JobTypes: jobTypes,
  627. IsLatestVersion: modelarts.IsLatestVersion,
  628. ComputeResource: listType,
  629. Type: models.TypeCloudBrainAll,
  630. })
  631. if err != nil {
  632. ctx.ServerError("Cloudbrain", err)
  633. return
  634. }
  635. for i, task := range tasks {
  636. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  637. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  638. }
  639. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  640. pager.SetDefaultParams(ctx)
  641. pager.AddParam(ctx, "listType", "ListType")
  642. ctx.Data["Page"] = pager
  643. ctx.Data["PageIsCloudBrain"] = true
  644. ctx.Data["Tasks"] = tasks
  645. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  646. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  647. ctx.HTML(200, tplModelArtsTrainJobIndex)
  648. }
  649. func TrainJobNew(ctx *context.Context) {
  650. err := trainJobNewDataPrepare(ctx)
  651. if err != nil {
  652. ctx.ServerError("get new train-job info failed", err)
  653. return
  654. }
  655. ctx.HTML(200, tplModelArtsTrainJobNew)
  656. }
  657. func trainJobNewDataPrepare(ctx *context.Context) error {
  658. ctx.Data["PageIsCloudBrain"] = true
  659. //can, err := canUserCreateTrainJob(ctx.User.ID)
  660. //if err != nil {
  661. // ctx.ServerError("canUserCreateTrainJob", err)
  662. // return
  663. //}
  664. //
  665. //if !can {
  666. // log.Error("the user can not create train-job")
  667. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  668. // return
  669. //}
  670. t := time.Now()
  671. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  672. ctx.Data["display_job_name"] = displayJobName
  673. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  674. if err != nil {
  675. ctx.ServerError("GetAllUserAttachments failed:", err)
  676. return err
  677. }
  678. ctx.Data["attachments"] = attachs
  679. var resourcePools modelarts.ResourcePool
  680. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  681. ctx.ServerError("json.Unmarshal failed:", err)
  682. return err
  683. }
  684. ctx.Data["resource_pools"] = resourcePools.Info
  685. var engines modelarts.Engine
  686. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  687. ctx.ServerError("json.Unmarshal failed:", err)
  688. return err
  689. }
  690. ctx.Data["engines"] = engines.Info
  691. var versionInfos modelarts.VersionInfo
  692. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  693. ctx.ServerError("json.Unmarshal failed:", err)
  694. return err
  695. }
  696. ctx.Data["engine_versions"] = versionInfos.Version
  697. prepareCloudbrainTwoTrainSpecs(ctx)
  698. ctx.Data["params"] = ""
  699. ctx.Data["branchName"] = ctx.Repo.BranchName
  700. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  701. if err != nil {
  702. ctx.ServerError("getConfigList failed:", err)
  703. return err
  704. }
  705. ctx.Data["config_list"] = configList.ParaConfigs
  706. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  707. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  708. ctx.Data["WaitCount"] = waitCount
  709. setMultiNodeIfConfigureMatch(ctx)
  710. return nil
  711. }
  712. func prepareCloudbrainTwoTrainSpecs(ctx *context.Context) {
  713. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  714. JobType: models.JobTypeTrain,
  715. ComputeResource: models.NPU,
  716. Cluster: models.OpenICluster,
  717. AiCenterCode: models.AICenterOfCloudBrainTwo,
  718. })
  719. ctx.Data["Specs"] = noteBookSpecs
  720. }
  721. func setMultiNodeIfConfigureMatch(ctx *context.Context) {
  722. modelarts.InitMultiNode()
  723. if modelarts.MultiNodeConfig != nil {
  724. for _, info := range modelarts.MultiNodeConfig.Info {
  725. if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg {
  726. ctx.Data["WorkNode"] = info.Node
  727. break
  728. }
  729. }
  730. }
  731. }
  732. func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) {
  733. modelarts.InitSpecialPool()
  734. if modelarts.SpecialPools != nil {
  735. for _, specialPool := range modelarts.SpecialPools.Pools {
  736. if cloudbrain.IsElementExist(specialPool.JobType, jobType) {
  737. if isInOrg, _ := models.IsOrganizationMemberByOrgName(specialPool.Org, ctx.User.ID); isInOrg {
  738. var specialFlavor []struct {
  739. Code string
  740. Value string
  741. }
  742. if jobType == string(models.JobTypeDebug) {
  743. ctx.Data["flavors"] = specialPool.Flavor
  744. } else {
  745. for _, tempFlavor := range specialPool.Flavor {
  746. specialFlavor = append(specialFlavor, struct {
  747. Code string
  748. Value string
  749. }{Code: tempFlavor.Value, Value: tempFlavor.Desc})
  750. }
  751. ctx.Data["flavor_infos"] = specialFlavor
  752. }
  753. }
  754. }
  755. }
  756. }
  757. }
  758. func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  759. ctx.Data["PageIsCloudBrain"] = true
  760. //can, err := canUserCreateTrainJob(ctx.User.ID)
  761. //if err != nil {
  762. // ctx.ServerError("canUserCreateTrainJob", err)
  763. // return
  764. //}
  765. //
  766. //if !can {
  767. // log.Error("the user can not create train-job")
  768. // ctx.ServerError("the user can not create train-job", fmt.Errorf("the user can not create train-job"))
  769. // return
  770. //}
  771. t := time.Now()
  772. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  773. ctx.Data["display_job_name"] = displayJobName
  774. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  775. if err != nil {
  776. ctx.ServerError("GetAllUserAttachments failed:", err)
  777. return err
  778. }
  779. ctx.Data["attachments"] = attachs
  780. var resourcePools modelarts.ResourcePool
  781. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  782. ctx.ServerError("json.Unmarshal failed:", err)
  783. return err
  784. }
  785. ctx.Data["resource_pools"] = resourcePools.Info
  786. var engines modelarts.Engine
  787. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  788. ctx.ServerError("json.Unmarshal failed:", err)
  789. return err
  790. }
  791. ctx.Data["engines"] = engines.Info
  792. var versionInfos modelarts.VersionInfo
  793. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  794. ctx.ServerError("json.Unmarshal failed:", err)
  795. return err
  796. }
  797. ctx.Data["engine_versions"] = versionInfos.Version
  798. prepareCloudbrainTwoTrainSpecs(ctx)
  799. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  800. if err != nil {
  801. ctx.ServerError("getConfigList failed:", err)
  802. return err
  803. }
  804. var Parameters modelarts.Parameters
  805. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  806. ctx.ServerError("json.Unmarshal failed:", err)
  807. return err
  808. }
  809. ctx.Data["params"] = Parameters.Parameter
  810. ctx.Data["config_list"] = configList.ParaConfigs
  811. ctx.Data["bootFile"] = form.BootFile
  812. ctx.Data["uuid"] = form.Attachment
  813. _, datasetNames, err := models.GetDatasetInfo(form.Attachment)
  814. if err != nil {
  815. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  816. return nil
  817. }
  818. ctx.Data["dataset_name"] = datasetNames
  819. ctx.Data["branch_name"] = form.BranchName
  820. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  821. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  822. ctx.Data["WaitCount"] = waitCount
  823. setMultiNodeIfConfigureMatch(ctx)
  824. return nil
  825. }
  826. func TrainJobNewVersion(ctx *context.Context) {
  827. err := trainJobNewVersionDataPrepare(ctx)
  828. if err != nil {
  829. ctx.ServerError("get new train-job info failed", err)
  830. return
  831. }
  832. ctx.HTML(200, tplModelArtsTrainJobVersionNew)
  833. }
  834. func trainJobNewVersionDataPrepare(ctx *context.Context) error {
  835. ctx.Data["PageIsCloudBrain"] = true
  836. var jobID = ctx.Params(":jobid")
  837. var versionName = ctx.Query("version_name")
  838. // canNewJob, err := canUserCreateTrainJobVersion(ctx, jobID, versionName)
  839. // if err != nil {
  840. // ctx.ServerError("canNewJob can info failed", err)
  841. // return err
  842. // }
  843. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  844. if err != nil {
  845. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  846. return err
  847. }
  848. ctx.Data["display_job_name"] = task.DisplayJobName
  849. ctx.Data["job_name"] = task.JobName
  850. // attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  851. // if err != nil {
  852. // ctx.ServerError("GetAllUserAttachments failed:", err)
  853. // return err
  854. // }
  855. // ctx.Data["attachments"] = attachs
  856. var resourcePools modelarts.ResourcePool
  857. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  858. ctx.ServerError("json.Unmarshal failed:", err)
  859. return err
  860. }
  861. ctx.Data["resource_pools"] = resourcePools.Info
  862. var engines modelarts.Engine
  863. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  864. ctx.ServerError("json.Unmarshal failed:", err)
  865. return err
  866. }
  867. ctx.Data["engines"] = engines.Info
  868. var versionInfos modelarts.VersionInfo
  869. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  870. ctx.ServerError("json.Unmarshal failed:", err)
  871. return err
  872. }
  873. ctx.Data["engine_versions"] = versionInfos.Version
  874. prepareCloudbrainTwoTrainSpecs(ctx)
  875. spec, _ := resource.GetCloudbrainSpec(task.ID)
  876. if spec != nil {
  877. log.Info("spec_id = %d", spec.ID)
  878. ctx.Data["spec_id"] = spec.ID
  879. }
  880. var Parameters modelarts.Parameters
  881. if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil {
  882. ctx.ServerError("json.Unmarshal failed:", err)
  883. return err
  884. }
  885. ctx.Data["params"] = Parameters.Parameter
  886. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  887. if err != nil {
  888. ctx.ServerError("GetBranches error:", err)
  889. return err
  890. }
  891. _, _, datasetNames, _, err := getDatasUrlListByUUIDS(task.Uuid)
  892. if err != nil {
  893. log.Info("query dataset error," + err.Error())
  894. //ctx.ServerError("GetAllUserAttachments failed:", err)
  895. //return err
  896. } else {
  897. ctx.Data["dataset_name"] = datasetNames
  898. }
  899. ctx.Data["branches"] = branches
  900. ctx.Data["branch_name"] = task.BranchName
  901. ctx.Data["description"] = task.Description
  902. ctx.Data["boot_file"] = task.BootFile
  903. ctx.Data["work_server_number"] = task.WorkServerNumber
  904. ctx.Data["flavor_name"] = task.FlavorName
  905. ctx.Data["engine_name"] = task.EngineName
  906. ctx.Data["uuid"] = task.Uuid
  907. ctx.Data["flavor_code"] = task.FlavorCode
  908. ctx.Data["engine_id"] = task.EngineID
  909. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  910. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  911. if err != nil {
  912. ctx.ServerError("getConfigList failed:", err)
  913. return err
  914. }
  915. ctx.Data["config_list"] = configList.ParaConfigs
  916. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  917. ctx.Data["WaitCount"] = waitCount
  918. return nil
  919. }
  920. func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
  921. ctx.Data["PageIsCloudBrain"] = true
  922. var jobID = ctx.Params(":jobid")
  923. // var versionName = ctx.Params(":version-name")
  924. var versionName = ctx.Query("version_name")
  925. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  926. if err != nil {
  927. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  928. return err
  929. }
  930. t := time.Now()
  931. var jobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  932. ctx.Data["job_name"] = task.JobName
  933. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  934. if err != nil {
  935. ctx.ServerError("GetAllUserAttachments failed:", err)
  936. return err
  937. }
  938. ctx.Data["attachments"] = attachs
  939. var resourcePools modelarts.ResourcePool
  940. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  941. ctx.ServerError("json.Unmarshal failed:", err)
  942. return err
  943. }
  944. ctx.Data["resource_pools"] = resourcePools.Info
  945. var engines modelarts.Engine
  946. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  947. ctx.ServerError("json.Unmarshal failed:", err)
  948. return err
  949. }
  950. ctx.Data["engines"] = engines.Info
  951. var versionInfos modelarts.VersionInfo
  952. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  953. ctx.ServerError("json.Unmarshal failed:", err)
  954. return err
  955. }
  956. ctx.Data["engine_versions"] = versionInfos.Version
  957. prepareCloudbrainTwoTrainSpecs(ctx)
  958. var Parameters modelarts.Parameters
  959. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  960. ctx.ServerError("json.Unmarshal failed:", err)
  961. return err
  962. }
  963. ctx.Data["params"] = Parameters.Parameter
  964. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
  965. ctx.Data["train_url"] = outputObsPath
  966. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  967. if err != nil {
  968. ctx.ServerError("GetBranches error:", err)
  969. return err
  970. }
  971. ctx.Data["branches"] = branches
  972. ctx.Data["description"] = form.Description
  973. ctx.Data["dataset_name"] = task.DatasetName
  974. ctx.Data["work_server_number"] = form.WorkServerNumber
  975. ctx.Data["flavor_name"] = form.FlavorName
  976. ctx.Data["engine_name"] = form.EngineName
  977. ctx.Data["flavor_code"] = task.FlavorCode
  978. ctx.Data["engine_id"] = task.EngineID
  979. ctx.Data["version_name"] = form.VersionName
  980. ctx.Data["bootFile"] = form.BootFile
  981. ctx.Data["uuid"] = form.Attachment
  982. ctx.Data["branch_name"] = form.BranchName
  983. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  984. if err != nil {
  985. ctx.ServerError("getConfigList failed:", err)
  986. return err
  987. }
  988. ctx.Data["config_list"] = configList.ParaConfigs
  989. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  990. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  991. ctx.Data["WaitCount"] = waitCount
  992. return nil
  993. }
  994. func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  995. ctx.Data["PageIsTrainJob"] = true
  996. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  997. displayJobName := form.DisplayJobName
  998. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  999. uuid := form.Attachment
  1000. description := form.Description
  1001. workServerNumber := form.WorkServerNumber
  1002. engineID := form.EngineID
  1003. bootFile := strings.TrimSpace(form.BootFile)
  1004. params := form.Params
  1005. poolID := form.PoolID
  1006. //isSaveParam := form.IsSaveParam
  1007. repo := ctx.Repo.Repository
  1008. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  1009. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  1010. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  1011. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  1012. // dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  1013. branchName := form.BranchName
  1014. isLatestVersion := modelarts.IsLatestVersion
  1015. FlavorName := form.FlavorName
  1016. VersionCount := modelarts.VersionCountOne
  1017. EngineName := form.EngineName
  1018. errStr := checkMultiNode(ctx.User.ID, form.WorkServerNumber)
  1019. if errStr != "" {
  1020. trainJobErrorNewDataPrepare(ctx, form)
  1021. ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form)
  1022. return
  1023. }
  1024. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  1025. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  1026. if !isOk {
  1027. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  1028. trainJobErrorNewDataPrepare(ctx, form)
  1029. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsTrainJobNew, &form)
  1030. return
  1031. }
  1032. defer lock.UnLock()
  1033. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  1034. if err != nil {
  1035. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1036. trainJobErrorNewDataPrepare(ctx, form)
  1037. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  1038. return
  1039. } else {
  1040. if count >= 1 {
  1041. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1042. trainJobErrorNewDataPrepare(ctx, form)
  1043. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
  1044. return
  1045. }
  1046. }
  1047. if err := paramCheckCreateTrainJob(form); err != nil {
  1048. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  1049. trainJobErrorNewDataPrepare(ctx, form)
  1050. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  1051. return
  1052. }
  1053. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  1054. if err != nil || !bootFileExist {
  1055. log.Error("Get bootfile error:", err)
  1056. trainJobErrorNewDataPrepare(ctx, form)
  1057. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsTrainJobNew, &form)
  1058. return
  1059. }
  1060. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  1061. JobType: models.JobTypeTrain,
  1062. ComputeResource: models.NPU,
  1063. Cluster: models.OpenICluster,
  1064. AiCenterCode: models.AICenterOfCloudBrainTwo})
  1065. if err != nil || spec == nil {
  1066. trainJobErrorNewDataPrepare(ctx, form)
  1067. ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form)
  1068. return
  1069. }
  1070. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1071. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  1072. cloudBrainNewDataPrepare(ctx)
  1073. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsTrainJobNew, &form)
  1074. return
  1075. }
  1076. //Determine whether the task name of the task in the project is duplicated
  1077. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  1078. if err == nil {
  1079. if len(tasks) != 0 {
  1080. log.Error("the job name did already exist", ctx.Data["MsgID"])
  1081. trainJobErrorNewDataPrepare(ctx, form)
  1082. ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form)
  1083. return
  1084. }
  1085. } else {
  1086. if !models.IsErrJobNotExist(err) {
  1087. log.Error("system error, %v", err, ctx.Data["MsgID"])
  1088. trainJobErrorNewDataPrepare(ctx, form)
  1089. ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
  1090. return
  1091. }
  1092. }
  1093. //todo: del the codeLocalPath
  1094. _, err = ioutil.ReadDir(codeLocalPath)
  1095. if err == nil {
  1096. os.RemoveAll(codeLocalPath)
  1097. }
  1098. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  1099. commitID, _ := gitRepo.GetBranchCommitID(branchName)
  1100. if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
  1101. log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
  1102. trainJobErrorNewDataPrepare(ctx, form)
  1103. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobNew, &form)
  1104. return
  1105. }
  1106. //todo: upload code (send to file_server todo this work?)
  1107. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  1108. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  1109. trainJobErrorNewDataPrepare(ctx, form)
  1110. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
  1111. return
  1112. }
  1113. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  1114. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  1115. trainJobErrorNewDataPrepare(ctx, form)
  1116. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
  1117. return
  1118. }
  1119. // parentDir := VersionOutputPath + "/"
  1120. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  1121. // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  1122. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  1123. trainJobErrorNewDataPrepare(ctx, form)
  1124. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobNew, &form)
  1125. return
  1126. }
  1127. var parameters models.Parameters
  1128. param := make([]models.Parameter, 0)
  1129. existDeviceTarget := false
  1130. if len(params) != 0 {
  1131. err := json.Unmarshal([]byte(params), &parameters)
  1132. if err != nil {
  1133. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1134. trainJobErrorNewDataPrepare(ctx, form)
  1135. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
  1136. return
  1137. }
  1138. for _, parameter := range parameters.Parameter {
  1139. if parameter.Label == modelarts.DeviceTarget {
  1140. existDeviceTarget = true
  1141. }
  1142. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  1143. param = append(param, models.Parameter{
  1144. Label: parameter.Label,
  1145. Value: parameter.Value,
  1146. })
  1147. }
  1148. }
  1149. }
  1150. if !existDeviceTarget {
  1151. param = append(param, models.Parameter{
  1152. Label: modelarts.DeviceTarget,
  1153. Value: modelarts.Ascend,
  1154. })
  1155. }
  1156. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  1157. if err != nil {
  1158. log.Error("Failed to getDatasUrlListByUUIDS: %v", err)
  1159. trainJobErrorNewDataPrepare(ctx, form)
  1160. ctx.RenderWithErr("Failed to getDatasUrlListByUUIDS:"+err.Error(), tplModelArtsTrainJobNew, &form)
  1161. return
  1162. }
  1163. dataPath := dataUrl
  1164. jsondatas, err := json.Marshal(datasUrlList)
  1165. if err != nil {
  1166. log.Error("Failed to Marshal: %v", err)
  1167. trainJobErrorNewDataPrepare(ctx, form)
  1168. ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsTrainJobNew, &form)
  1169. return
  1170. }
  1171. if isMultiDataset {
  1172. param = append(param, models.Parameter{
  1173. Label: modelarts.MultiDataUrl,
  1174. Value: string(jsondatas),
  1175. })
  1176. }
  1177. //save param config
  1178. // if isSaveParam == "on" {
  1179. // saveparams := append(param, models.Parameter{
  1180. // Label: modelarts.TrainUrl,
  1181. // Value: outputObsPath,
  1182. // }, models.Parameter{
  1183. // Label: modelarts.DataUrl,
  1184. // Value: dataPath,
  1185. // })
  1186. // if form.ParameterTemplateName == "" {
  1187. // log.Error("ParameterTemplateName is empty")
  1188. // trainJobNewDataPrepare(ctx)
  1189. // ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
  1190. // return
  1191. // }
  1192. // _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  1193. // ConfigName: form.ParameterTemplateName,
  1194. // Description: form.PrameterDescription,
  1195. // DataUrl: dataPath,
  1196. // AppUrl: codeObsPath,
  1197. // BootFileUrl: codeObsPath + bootFile,
  1198. // TrainUrl: outputObsPath,
  1199. // Flavor: models.Flavor{
  1200. // Code: flavorCode,
  1201. // },
  1202. // WorkServerNum: workServerNumber,
  1203. // EngineID: int64(engineID),
  1204. // LogUrl: logObsPath,
  1205. // PoolID: poolID,
  1206. // Parameter: saveparams,
  1207. // })
  1208. // if err != nil {
  1209. // log.Error("Failed to CreateTrainJobConfig: %v", err)
  1210. // trainJobErrorNewDataPrepare(ctx, form)
  1211. // ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
  1212. // return
  1213. // }
  1214. // }
  1215. req := &modelarts.GenerateTrainJobReq{
  1216. JobName: jobName,
  1217. DisplayJobName: displayJobName,
  1218. DataUrl: dataPath,
  1219. Description: description,
  1220. CodeObsPath: codeObsPath,
  1221. BootFileUrl: codeObsPath + bootFile,
  1222. BootFile: bootFile,
  1223. TrainUrl: outputObsPath,
  1224. WorkServerNumber: workServerNumber,
  1225. EngineID: int64(engineID),
  1226. LogUrl: logObsPath,
  1227. PoolID: poolID,
  1228. Uuid: uuid,
  1229. Parameters: param,
  1230. CommitID: commitID,
  1231. IsLatestVersion: isLatestVersion,
  1232. BranchName: branchName,
  1233. Params: form.Params,
  1234. FlavorName: FlavorName,
  1235. EngineName: EngineName,
  1236. VersionCount: VersionCount,
  1237. TotalVersionCount: modelarts.TotalVersionCount,
  1238. DatasetName: datasetNames,
  1239. Spec: spec,
  1240. }
  1241. userCommand, userImageUrl := getUserCommand(engineID, req)
  1242. req.UserCommand = userCommand
  1243. req.UserImageUrl = userImageUrl
  1244. //将params转换Parameters.Parameter,出错时返回给前端
  1245. var Parameters modelarts.Parameters
  1246. if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
  1247. ctx.ServerError("json.Unmarshal failed:", err)
  1248. return
  1249. }
  1250. err = modelarts.GenerateTrainJob(ctx, req)
  1251. if err != nil {
  1252. log.Error("GenerateTrainJob failed:%v", err.Error())
  1253. trainJobErrorNewDataPrepare(ctx, form)
  1254. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
  1255. return
  1256. }
  1257. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  1258. }
  1259. func checkMultiNode(userId int64, serverNum int) string {
  1260. if serverNum == 1 {
  1261. return ""
  1262. }
  1263. modelarts.InitMultiNode()
  1264. var isServerNumValid = false
  1265. if modelarts.MultiNodeConfig != nil {
  1266. for _, info := range modelarts.MultiNodeConfig.Info {
  1267. if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg {
  1268. if isInNodes(info.Node, serverNum) {
  1269. isServerNumValid = true
  1270. break
  1271. }
  1272. }
  1273. }
  1274. }
  1275. if isServerNumValid {
  1276. return ""
  1277. } else {
  1278. return "repo.modelarts.no_node_right"
  1279. }
  1280. }
  1281. func checkInferenceJobMultiNode(userId int64, serverNum int) string {
  1282. if serverNum == 1 {
  1283. return ""
  1284. }
  1285. return "repo.modelarts.no_node_right"
  1286. }
  1287. func isInNodes(nodes []int, num int) bool {
  1288. for _, node := range nodes {
  1289. if node == num {
  1290. return true
  1291. }
  1292. }
  1293. return false
  1294. }
  1295. func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) {
  1296. userImageUrl := ""
  1297. userCommand := ""
  1298. if engineId < 0 {
  1299. tmpCodeObsPath := strings.Trim(req.CodeObsPath, "/")
  1300. tmpCodeObsPaths := strings.Split(tmpCodeObsPath, "/")
  1301. lastCodeDir := "code"
  1302. if len(tmpCodeObsPaths) > 0 {
  1303. lastCodeDir = tmpCodeObsPaths[len(tmpCodeObsPaths)-1]
  1304. }
  1305. userCommand = "/bin/bash /home/work/run_train.sh 's3://" + req.CodeObsPath + "' '" + lastCodeDir + "/" + req.BootFile + "' '/tmp/log/train.log' --'data_url'='s3://" + req.DataUrl + "' --'train_url'='s3://" + req.TrainUrl + "'"
  1306. var versionInfos modelarts.VersionInfo
  1307. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  1308. log.Info("json parse err." + err.Error())
  1309. } else {
  1310. for _, engine := range versionInfos.Version {
  1311. if engine.ID == engineId {
  1312. userImageUrl = engine.Url
  1313. break
  1314. }
  1315. }
  1316. }
  1317. for _, param := range req.Parameters {
  1318. userCommand += " --'" + param.Label + "'='" + param.Value + "'"
  1319. }
  1320. return userCommand, userImageUrl
  1321. }
  1322. return userCommand, userImageUrl
  1323. }
  1324. func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
  1325. ctx.Data["PageIsTrainJob"] = true
  1326. var jobID = ctx.Params(":jobid")
  1327. errStr := checkMultiNode(ctx.User.ID, form.WorkServerNumber)
  1328. if errStr != "" {
  1329. versionErrorDataPrepare(ctx, form)
  1330. ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form)
  1331. return
  1332. }
  1333. count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
  1334. if err != nil {
  1335. log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1336. versionErrorDataPrepare(ctx, form)
  1337. ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form)
  1338. return
  1339. } else {
  1340. if count >= 1 {
  1341. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1342. versionErrorDataPrepare(ctx, form)
  1343. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form)
  1344. return
  1345. }
  1346. }
  1347. latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
  1348. if err != nil {
  1349. ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
  1350. return
  1351. }
  1352. VersionOutputPath := modelarts.GetOutputPathByCount(latestTask.TotalVersionCount + 1)
  1353. displayJobName := form.DisplayJobName
  1354. jobName := form.JobName
  1355. uuid := form.Attachment
  1356. description := form.Description
  1357. workServerNumber := form.WorkServerNumber
  1358. engineID := form.EngineID
  1359. bootFile := strings.TrimSpace(form.BootFile)
  1360. params := form.Params
  1361. poolID := form.PoolID
  1362. //isSaveParam := form.IsSaveParam
  1363. repo := ctx.Repo.Repository
  1364. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  1365. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
  1366. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
  1367. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  1368. // dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  1369. branchName := form.BranchName
  1370. PreVersionName := form.VersionName
  1371. FlavorName := form.FlavorName
  1372. EngineName := form.EngineName
  1373. isLatestVersion := modelarts.IsLatestVersion
  1374. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  1375. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  1376. if !isOk {
  1377. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  1378. versionErrorDataPrepare(ctx, form)
  1379. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsTrainJobVersionNew, &form)
  1380. return
  1381. }
  1382. defer lock.UnLock()
  1383. canNewJob, _ := canUserCreateTrainJobVersion(ctx, latestTask.UserID)
  1384. if !canNewJob {
  1385. versionErrorDataPrepare(ctx, form)
  1386. ctx.RenderWithErr("user cann't new trainjob", tplModelArtsTrainJobVersionNew, &form)
  1387. return
  1388. }
  1389. if err := paramCheckCreateTrainJob(form); err != nil {
  1390. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  1391. versionErrorDataPrepare(ctx, form)
  1392. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1393. return
  1394. }
  1395. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  1396. if err != nil || !bootFileExist {
  1397. log.Error("Get bootfile error:", err)
  1398. versionErrorDataPrepare(ctx, form)
  1399. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsTrainJobVersionNew, &form)
  1400. return
  1401. }
  1402. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  1403. JobType: models.JobTypeTrain,
  1404. ComputeResource: models.NPU,
  1405. Cluster: models.OpenICluster,
  1406. AiCenterCode: models.AICenterOfCloudBrainTwo})
  1407. if err != nil || spec == nil {
  1408. versionErrorDataPrepare(ctx, form)
  1409. ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form)
  1410. return
  1411. }
  1412. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1413. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  1414. versionErrorDataPrepare(ctx, form)
  1415. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsTrainJobVersionNew, &form)
  1416. return
  1417. }
  1418. //todo: del the codeLocalPath
  1419. _, err = ioutil.ReadDir(codeLocalPath)
  1420. if err == nil {
  1421. os.RemoveAll(codeLocalPath)
  1422. }
  1423. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  1424. commitID, _ := gitRepo.GetBranchCommitID(branchName)
  1425. if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
  1426. log.Error("Failed git clone repo to local(!: %s (%v)", repo.FullName(), err)
  1427. versionErrorDataPrepare(ctx, form)
  1428. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobVersionNew, &form)
  1429. return
  1430. }
  1431. //todo: upload code (send to file_server todo this work?)
  1432. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  1433. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  1434. versionErrorDataPrepare(ctx, form)
  1435. ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
  1436. return
  1437. }
  1438. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  1439. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  1440. versionErrorDataPrepare(ctx, form)
  1441. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
  1442. return
  1443. }
  1444. parentDir := VersionOutputPath + "/"
  1445. // parentDir := ""
  1446. // if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  1447. if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  1448. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  1449. versionErrorDataPrepare(ctx, form)
  1450. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsTrainJobVersionNew, &form)
  1451. return
  1452. }
  1453. //todo: del local code?
  1454. var parameters models.Parameters
  1455. param := make([]models.Parameter, 0)
  1456. existDeviceTarget := false
  1457. if len(params) != 0 {
  1458. err := json.Unmarshal([]byte(params), &parameters)
  1459. if err != nil {
  1460. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1461. versionErrorDataPrepare(ctx, form)
  1462. ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
  1463. return
  1464. }
  1465. for _, parameter := range parameters.Parameter {
  1466. if parameter.Label == modelarts.DeviceTarget {
  1467. existDeviceTarget = true
  1468. }
  1469. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  1470. param = append(param, models.Parameter{
  1471. Label: parameter.Label,
  1472. Value: parameter.Value,
  1473. })
  1474. }
  1475. }
  1476. }
  1477. if !existDeviceTarget {
  1478. param = append(param, models.Parameter{
  1479. Label: modelarts.DeviceTarget,
  1480. Value: modelarts.Ascend,
  1481. })
  1482. }
  1483. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  1484. if err != nil {
  1485. log.Error("Failed to getDatasUrlListByUUIDS: %v", err)
  1486. versionErrorDataPrepare(ctx, form)
  1487. ctx.RenderWithErr("Failed to getDatasUrlListByUUIDS:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1488. return
  1489. }
  1490. dataPath := dataUrl
  1491. jsondatas, err := json.Marshal(datasUrlList)
  1492. if err != nil {
  1493. log.Error("Failed to Marshal: %v", err)
  1494. versionErrorDataPrepare(ctx, form)
  1495. ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1496. return
  1497. }
  1498. if isMultiDataset {
  1499. param = append(param, models.Parameter{
  1500. Label: modelarts.MultiDataUrl,
  1501. Value: string(jsondatas),
  1502. })
  1503. }
  1504. // //save param config
  1505. // if isSaveParam == "on" {
  1506. // saveparams := append(param, models.Parameter{
  1507. // Label: modelarts.TrainUrl,
  1508. // Value: outputObsPath,
  1509. // }, models.Parameter{
  1510. // Label: modelarts.DataUrl,
  1511. // Value: dataPath,
  1512. // })
  1513. // if form.ParameterTemplateName == "" {
  1514. // log.Error("ParameterTemplateName is empty")
  1515. // versionErrorDataPrepare(ctx, form)
  1516. // ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
  1517. // return
  1518. // }
  1519. // _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
  1520. // ConfigName: form.ParameterTemplateName,
  1521. // Description: form.PrameterDescription,
  1522. // DataUrl: dataPath,
  1523. // AppUrl: codeObsPath,
  1524. // BootFileUrl: codeObsPath + bootFile,
  1525. // TrainUrl: outputObsPath,
  1526. // Flavor: models.Flavor{
  1527. // Code: flavorCode,
  1528. // },
  1529. // WorkServerNum: workServerNumber,
  1530. // EngineID: int64(engineID),
  1531. // LogUrl: logObsPath,
  1532. // PoolID: poolID,
  1533. // Parameter: saveparams,
  1534. // })
  1535. // if err != nil {
  1536. // log.Error("Failed to CreateTrainJobConfig: %v", err)
  1537. // versionErrorDataPrepare(ctx, form)
  1538. // ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1539. // return
  1540. // }
  1541. // }
  1542. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
  1543. if err != nil {
  1544. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
  1545. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1546. return
  1547. }
  1548. req := &modelarts.GenerateTrainJobReq{
  1549. JobName: jobName,
  1550. DisplayJobName: displayJobName,
  1551. DataUrl: dataPath,
  1552. Description: description,
  1553. CodeObsPath: codeObsPath,
  1554. BootFileUrl: codeObsPath + bootFile,
  1555. BootFile: bootFile,
  1556. TrainUrl: outputObsPath,
  1557. WorkServerNumber: workServerNumber,
  1558. IsLatestVersion: isLatestVersion,
  1559. EngineID: int64(engineID),
  1560. LogUrl: logObsPath,
  1561. PoolID: poolID,
  1562. Uuid: uuid,
  1563. Params: form.Params,
  1564. Parameters: param,
  1565. PreVersionId: task.VersionID,
  1566. CommitID: commitID,
  1567. BranchName: branchName,
  1568. FlavorName: FlavorName,
  1569. EngineName: EngineName,
  1570. PreVersionName: PreVersionName,
  1571. TotalVersionCount: latestTask.TotalVersionCount + 1,
  1572. DatasetName: datasetNames,
  1573. Spec: spec,
  1574. }
  1575. userCommand, userImageUrl := getUserCommand(engineID, req)
  1576. req.UserCommand = userCommand
  1577. req.UserImageUrl = userImageUrl
  1578. err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
  1579. if err != nil {
  1580. log.Error("GenerateTrainJob failed:%v", err.Error())
  1581. versionErrorDataPrepare(ctx, form)
  1582. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
  1583. return
  1584. }
  1585. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job/" + jobID)
  1586. // ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1587. }
  1588. // readDir reads the directory named by dirname and returns
  1589. // a list of directory entries sorted by filename.
  1590. func readDir(dirname string) ([]os.FileInfo, error) {
  1591. f, err := os.Open(dirname)
  1592. if err != nil {
  1593. return nil, err
  1594. }
  1595. list, err := f.Readdir(0)
  1596. f.Close()
  1597. if err != nil {
  1598. //todo: can not upload empty folder
  1599. if err == io.EOF {
  1600. return nil, nil
  1601. }
  1602. return nil, err
  1603. }
  1604. //sort.Slice(list, func(i, j int) bool { return list[i].Name() < list[j].Name() })
  1605. return list, nil
  1606. }
  1607. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  1608. files, err := readDir(codePath)
  1609. if err != nil {
  1610. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  1611. return err
  1612. }
  1613. for _, file := range files {
  1614. if file.IsDir() {
  1615. input := &obs.PutObjectInput{}
  1616. input.Bucket = setting.Bucket
  1617. input.Key = parentDir + file.Name() + "/"
  1618. _, err = storage.ObsCli.PutObject(input)
  1619. if err != nil {
  1620. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1621. return err
  1622. }
  1623. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  1624. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  1625. return err
  1626. }
  1627. } else {
  1628. input := &obs.PutFileInput{}
  1629. input.Bucket = setting.Bucket
  1630. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  1631. input.SourceFile = codePath + file.Name()
  1632. _, err = storage.ObsCli.PutFile(input)
  1633. if err != nil {
  1634. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  1635. return err
  1636. }
  1637. }
  1638. }
  1639. return nil
  1640. }
  1641. func obsMkdir(dir string) error {
  1642. input := &obs.PutObjectInput{}
  1643. input.Bucket = setting.Bucket
  1644. input.Key = dir
  1645. _, err := storage.ObsCli.PutObject(input)
  1646. if err != nil {
  1647. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  1648. return err
  1649. }
  1650. return nil
  1651. }
  1652. func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
  1653. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  1654. log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile))
  1655. return errors.New("启动文件必须是python文件")
  1656. }
  1657. if form.BranchName == "" {
  1658. log.Error("the branch must not be null!", form.BranchName)
  1659. return errors.New("代码分支不能为空!")
  1660. }
  1661. return nil
  1662. }
  1663. func paramCheckCreateInferenceJob(form auth.CreateModelArtsInferenceJobForm) error {
  1664. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  1665. log.Error("the boot file(%s) must be a python file", strings.TrimSpace(form.BootFile))
  1666. return errors.New("启动文件必须是python文件")
  1667. }
  1668. if form.WorkServerNumber > 2 || form.WorkServerNumber < 1 {
  1669. log.Error("the WorkServerNumber(%d) must be in (1,2)", form.WorkServerNumber)
  1670. return errors.New("计算节点数必须在1-2之间")
  1671. }
  1672. if form.ModelName == "" {
  1673. log.Error("the ModelName(%d) must not be nil", form.ModelName)
  1674. return errors.New("模型名称不能为空")
  1675. }
  1676. if form.ModelVersion == "" {
  1677. log.Error("the ModelVersion(%d) must not be nil", form.ModelVersion)
  1678. return errors.New("模型版本不能为空")
  1679. }
  1680. if form.CkptName == "" {
  1681. log.Error("the CkptName(%d) must not be nil", form.CkptName)
  1682. return errors.New("权重文件不能为空")
  1683. }
  1684. if form.BranchName == "" {
  1685. log.Error("the Branch(%d) must not be nil", form.BranchName)
  1686. return errors.New("分支名不能为空")
  1687. }
  1688. if utf8.RuneCountInString(form.Description) > 255 {
  1689. log.Error("the Description length(%d) must not more than 255", form.Description)
  1690. return errors.New("描述字符不能超过255个字符")
  1691. }
  1692. return nil
  1693. }
  1694. func TrainJobShow(ctx *context.Context) {
  1695. ctx.Data["PageIsCloudBrain"] = true
  1696. var jobID = ctx.Params(":jobid")
  1697. repo := ctx.Repo.Repository
  1698. page := ctx.QueryInt("page")
  1699. if page <= 0 {
  1700. page = 1
  1701. }
  1702. var jobTypes []string
  1703. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  1704. VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1705. ListOptions: models.ListOptions{
  1706. Page: page,
  1707. PageSize: setting.UI.IssuePagingNum,
  1708. },
  1709. RepoID: repo.ID,
  1710. Type: models.TypeCloudBrainTwo,
  1711. JobTypes: jobTypes,
  1712. JobID: jobID,
  1713. })
  1714. if err != nil {
  1715. log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
  1716. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1717. return
  1718. }
  1719. if len(VersionListTasks) == 0 {
  1720. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1721. return
  1722. }
  1723. //设置权限
  1724. canNewJob, err := canUserCreateTrainJobVersion(ctx, VersionListTasks[0].UserID)
  1725. if err != nil {
  1726. ctx.ServerError("canNewJob failed", err)
  1727. return
  1728. }
  1729. ctx.Data["canNewJob"] = canNewJob
  1730. datasetList := make([][]models.DatasetDownload, 0)
  1731. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  1732. for i, task := range VersionListTasks {
  1733. var parameters models.Parameters
  1734. err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), &parameters)
  1735. if err != nil {
  1736. log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
  1737. trainJobNewDataPrepare(ctx)
  1738. return
  1739. }
  1740. if len(parameters.Parameter) > 0 {
  1741. paramTemp := ""
  1742. for _, Parameter := range parameters.Parameter {
  1743. param := Parameter.Label + " = " + Parameter.Value + "; "
  1744. paramTemp = paramTemp + param
  1745. }
  1746. VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
  1747. } else {
  1748. VersionListTasks[i].Parameters = ""
  1749. }
  1750. datasetList = append(datasetList, GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false))
  1751. VersionListTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  1752. VersionListTasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  1753. //add spec
  1754. s, err := resource.GetCloudbrainSpec(task.Cloudbrain.ID)
  1755. if err != nil {
  1756. log.Error("TrainJobShow GetCloudbrainSpec error:" + err.Error())
  1757. continue
  1758. }
  1759. VersionListTasks[i].Cloudbrain.Spec = s
  1760. }
  1761. pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5)
  1762. pager.SetDefaultParams(ctx)
  1763. ctx.Data["Page"] = pager
  1764. ctx.Data["jobID"] = jobID
  1765. ctx.Data["displayJobName"] = VersionListTasks[0].DisplayJobName
  1766. ctx.Data["version_list_task"] = VersionListTasks
  1767. ctx.Data["version_list_count"] = VersionListCount
  1768. ctx.Data["datasetList"] = datasetList
  1769. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, &VersionListTasks[0].Cloudbrain)
  1770. ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1771. }
  1772. func TrainJobDel(ctx *context.Context) {
  1773. var jobID = ctx.Params(":jobid")
  1774. var listType = ctx.Query("listType")
  1775. repo := ctx.Repo.Repository
  1776. var jobTypes []string
  1777. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  1778. VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  1779. RepoID: repo.ID,
  1780. Type: models.TypeCloudBrainTwo,
  1781. JobTypes: jobTypes,
  1782. JobID: jobID,
  1783. })
  1784. if err != nil {
  1785. ctx.ServerError("get VersionListTasks failed", err)
  1786. return
  1787. }
  1788. for _, task := range VersionListTasks {
  1789. if task.Status != string(models.ModelArtsTrainJobImageFailed) && task.Status != string(models.ModelArtsTrainJobSubmitFailed) && task.Status != string(models.ModelArtsTrainJobDeleteFailed) &&
  1790. task.Status != string(models.ModelArtsTrainJobCompleted) && task.Status != string(models.ModelArtsTrainJobFailed) &&
  1791. task.Status != string(models.ModelArtsTrainJobKilled) && task.Status != string(models.ModelArtsTrainJobCanceled) && task.Status != string(models.ModelArtsTrainJobLost) {
  1792. log.Error("the job(%s) version has not been stopped", task.JobName)
  1793. ctx.RenderWithErr("the job version has not been stopped", tplModelArtsTrainJobIndex, nil)
  1794. return
  1795. }
  1796. }
  1797. //删除modelarts上的任务记录
  1798. _, err = modelarts.DelTrainJob(jobID)
  1799. if err != nil {
  1800. log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error())
  1801. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1802. return
  1803. }
  1804. //删除数据库Cloudbrain表的记录
  1805. for _, task := range VersionListTasks {
  1806. err = models.DeleteJob(&task.Cloudbrain)
  1807. if err != nil {
  1808. ctx.ServerError("DeleteJob failed", err)
  1809. return
  1810. }
  1811. }
  1812. //删除存储
  1813. if len(VersionListTasks) > 0 {
  1814. DeleteJobStorage(VersionListTasks[0].JobName)
  1815. }
  1816. var isAdminPage = ctx.Query("isadminpage")
  1817. var isHomePage = ctx.Query("ishomepage")
  1818. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  1819. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  1820. } else if isHomePage == "true" {
  1821. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  1822. } else {
  1823. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  1824. }
  1825. }
  1826. func TrainJobStop(ctx *context.Context) {
  1827. var jobID = ctx.Params(":jobid")
  1828. var listType = ctx.Query("listType")
  1829. task := ctx.Cloudbrain
  1830. _, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
  1831. if err != nil {
  1832. log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
  1833. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
  1834. return
  1835. }
  1836. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  1837. }
  1838. func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) {
  1839. if ctx == nil || ctx.User == nil {
  1840. log.Error("user unlogin!")
  1841. return false, nil
  1842. }
  1843. if userID == ctx.User.ID || ctx.User.IsAdmin {
  1844. return true, nil
  1845. } else {
  1846. log.Error("Only user itself and admin can new trainjob!")
  1847. return false, nil
  1848. }
  1849. }
  1850. func TrainJobGetConfigList(ctx *context.Context) {
  1851. ctx.Data["PageIsTrainJob"] = true
  1852. var jobID = ctx.Params(":jobid")
  1853. var logFileName = ctx.Query("file_name")
  1854. var baseLine = ctx.Query("base_line")
  1855. var order = ctx.Query("order")
  1856. if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
  1857. log.Error("order(%s) check failed", order)
  1858. ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
  1859. return
  1860. }
  1861. task, err := models.GetCloudbrainByJobID(jobID)
  1862. if err != nil {
  1863. log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
  1864. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1865. return
  1866. }
  1867. result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
  1868. if err != nil {
  1869. log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
  1870. ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
  1871. return
  1872. }
  1873. ctx.Data["log"] = result
  1874. //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
  1875. }
  1876. func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
  1877. var result models.GetConfigListResult
  1878. list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
  1879. if err != nil {
  1880. log.Error("GetConfigList failed:", err)
  1881. return &result, err
  1882. }
  1883. for _, config := range list.ParaConfigs {
  1884. paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
  1885. if err != nil {
  1886. log.Error("GetParaConfig failed:", err)
  1887. return &result, err
  1888. }
  1889. config.Result = paraConfig
  1890. }
  1891. return list, nil
  1892. }
  1893. func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) {
  1894. ctx.Data["PageIsTrainJob"] = true
  1895. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  1896. displayJobName := form.DisplayJobName
  1897. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  1898. uuid := form.Attachment
  1899. description := form.Description
  1900. workServerNumber := form.WorkServerNumber
  1901. engineID := form.EngineID
  1902. bootFile := strings.TrimSpace(form.BootFile)
  1903. params := form.Params
  1904. poolID := form.PoolID
  1905. repo := ctx.Repo.Repository
  1906. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  1907. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  1908. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  1909. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  1910. //dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  1911. branchName := form.BranchName
  1912. FlavorName := form.FlavorName
  1913. EngineName := form.EngineName
  1914. LabelName := form.LabelName
  1915. isLatestVersion := modelarts.IsLatestVersion
  1916. VersionCount := modelarts.VersionCountOne
  1917. trainUrl := form.TrainUrl
  1918. modelName := form.ModelName
  1919. modelVersion := form.ModelVersion
  1920. ckptName := form.CkptName
  1921. ckptUrl := "/" + form.TrainUrl + form.CkptName
  1922. log.Info("ckpt url:" + ckptUrl)
  1923. errStr := checkInferenceJobMultiNode(ctx.User.ID, form.WorkServerNumber)
  1924. if errStr != "" {
  1925. inferenceJobErrorNewDataPrepare(ctx, form)
  1926. ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form)
  1927. return
  1928. }
  1929. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeInference), displayJobName))
  1930. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  1931. if !isOk {
  1932. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  1933. inferenceJobErrorNewDataPrepare(ctx, form)
  1934. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplModelArtsInferenceJobNew, &form)
  1935. return
  1936. }
  1937. defer lock.UnLock()
  1938. count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID)
  1939. if err != nil {
  1940. log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1941. inferenceJobErrorNewDataPrepare(ctx, form)
  1942. ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
  1943. return
  1944. } else {
  1945. if count >= 1 {
  1946. log.Error("the user already has running or waiting inference task", ctx.Data["MsgID"])
  1947. inferenceJobErrorNewDataPrepare(ctx, form)
  1948. ctx.RenderWithErr("you have already a running or waiting inference task, can not create more", tplModelArtsInferenceJobNew, &form)
  1949. return
  1950. }
  1951. }
  1952. if err := paramCheckCreateInferenceJob(form); err != nil {
  1953. log.Error("paramCheckCreateInferenceJob failed:(%v)", err)
  1954. inferenceJobErrorNewDataPrepare(ctx, form)
  1955. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  1956. return
  1957. }
  1958. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  1959. if err != nil || !bootFileExist {
  1960. log.Error("Get bootfile error:", err)
  1961. inferenceJobErrorNewDataPrepare(ctx, form)
  1962. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplModelArtsInferenceJobNew, &form)
  1963. return
  1964. }
  1965. //Determine whether the task name of the task in the project is duplicated
  1966. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeInference), displayJobName)
  1967. if err == nil {
  1968. if len(tasks) != 0 {
  1969. log.Error("the job name did already exist", ctx.Data["MsgID"])
  1970. inferenceJobErrorNewDataPrepare(ctx, form)
  1971. ctx.RenderWithErr("the job name did already exist", tplModelArtsInferenceJobNew, &form)
  1972. return
  1973. }
  1974. } else {
  1975. if !models.IsErrJobNotExist(err) {
  1976. log.Error("system error, %v", err, ctx.Data["MsgID"])
  1977. inferenceJobErrorNewDataPrepare(ctx, form)
  1978. ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
  1979. return
  1980. }
  1981. }
  1982. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  1983. JobType: models.JobTypeInference,
  1984. ComputeResource: models.NPU,
  1985. Cluster: models.OpenICluster,
  1986. AiCenterCode: models.AICenterOfCloudBrainTwo})
  1987. if err != nil || spec == nil {
  1988. inferenceJobErrorNewDataPrepare(ctx, form)
  1989. ctx.RenderWithErr("Resource specification not available", tplModelArtsInferenceJobNew, &form)
  1990. return
  1991. }
  1992. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1993. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  1994. inferenceJobErrorNewDataPrepare(ctx, form)
  1995. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplModelArtsInferenceJobNew, &form)
  1996. return
  1997. }
  1998. //todo: del the codeLocalPath
  1999. _, err = ioutil.ReadDir(codeLocalPath)
  2000. if err == nil {
  2001. os.RemoveAll(codeLocalPath)
  2002. }
  2003. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  2004. commitID, _ := gitRepo.GetBranchCommitID(branchName)
  2005. if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
  2006. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  2007. inferenceJobErrorNewDataPrepare(ctx, form)
  2008. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsInferenceJobNew, &form)
  2009. return
  2010. }
  2011. //todo: upload code (send to file_server todo this work?)
  2012. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  2013. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  2014. inferenceJobErrorNewDataPrepare(ctx, form)
  2015. ctx.RenderWithErr("Failed to obsMkdir_result", tplModelArtsInferenceJobNew, &form)
  2016. return
  2017. }
  2018. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  2019. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  2020. inferenceJobErrorNewDataPrepare(ctx, form)
  2021. ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsInferenceJobNew, &form)
  2022. return
  2023. }
  2024. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  2025. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  2026. inferenceJobErrorNewDataPrepare(ctx, form)
  2027. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplModelArtsInferenceJobNew, &form)
  2028. return
  2029. }
  2030. var parameters models.Parameters
  2031. param := make([]models.Parameter, 0)
  2032. param = append(param, models.Parameter{
  2033. Label: modelarts.ResultUrl,
  2034. Value: "s3:/" + resultObsPath,
  2035. }, models.Parameter{
  2036. Label: modelarts.CkptUrl,
  2037. Value: "s3:/" + ckptUrl,
  2038. })
  2039. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  2040. if err != nil {
  2041. inferenceJobErrorNewDataPrepare(ctx, form)
  2042. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  2043. return
  2044. }
  2045. dataPath := dataUrl
  2046. jsondatas, err := json.Marshal(datasUrlList)
  2047. if err != nil {
  2048. log.Error("Failed to Marshal: %v", err)
  2049. inferenceJobErrorNewDataPrepare(ctx, form)
  2050. ctx.RenderWithErr("json error:"+err.Error(), tplModelArtsInferenceJobNew, &form)
  2051. return
  2052. }
  2053. if isMultiDataset {
  2054. param = append(param, models.Parameter{
  2055. Label: modelarts.MultiDataUrl,
  2056. Value: string(jsondatas),
  2057. })
  2058. }
  2059. existDeviceTarget := false
  2060. if len(params) != 0 {
  2061. err := json.Unmarshal([]byte(params), &parameters)
  2062. if err != nil {
  2063. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  2064. inferenceJobErrorNewDataPrepare(ctx, form)
  2065. ctx.RenderWithErr("运行参数错误", tplModelArtsInferenceJobNew, &form)
  2066. return
  2067. }
  2068. for _, parameter := range parameters.Parameter {
  2069. if parameter.Label == modelarts.DeviceTarget {
  2070. existDeviceTarget = true
  2071. }
  2072. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  2073. param = append(param, models.Parameter{
  2074. Label: parameter.Label,
  2075. Value: parameter.Value,
  2076. })
  2077. }
  2078. }
  2079. }
  2080. if !existDeviceTarget {
  2081. param = append(param, models.Parameter{
  2082. Label: modelarts.DeviceTarget,
  2083. Value: modelarts.Ascend,
  2084. })
  2085. }
  2086. req := &modelarts.GenerateInferenceJobReq{
  2087. JobName: jobName,
  2088. DisplayJobName: displayJobName,
  2089. DataUrl: dataPath,
  2090. Description: description,
  2091. CodeObsPath: codeObsPath,
  2092. BootFileUrl: codeObsPath + bootFile,
  2093. BootFile: bootFile,
  2094. TrainUrl: trainUrl,
  2095. WorkServerNumber: workServerNumber,
  2096. EngineID: int64(engineID),
  2097. LogUrl: logObsPath,
  2098. PoolID: poolID,
  2099. Uuid: uuid,
  2100. Parameters: param, //modelarts train parameters
  2101. CommitID: commitID,
  2102. BranchName: branchName,
  2103. Params: form.Params,
  2104. FlavorName: FlavorName,
  2105. EngineName: EngineName,
  2106. LabelName: LabelName,
  2107. IsLatestVersion: isLatestVersion,
  2108. VersionCount: VersionCount,
  2109. TotalVersionCount: modelarts.TotalVersionCount,
  2110. ModelName: modelName,
  2111. ModelVersion: modelVersion,
  2112. CkptName: ckptName,
  2113. ResultUrl: resultObsPath,
  2114. Spec: spec,
  2115. DatasetName: datasetNames,
  2116. }
  2117. err = modelarts.GenerateInferenceJob(ctx, req)
  2118. if err != nil {
  2119. log.Error("GenerateTrainJob failed:%v", err.Error())
  2120. inferenceJobErrorNewDataPrepare(ctx, form)
  2121. ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
  2122. return
  2123. }
  2124. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")
  2125. }
  2126. func checkModelArtsSpecialPool(ctx *context.Context, flavorCode string, jobType string) string {
  2127. if modelarts.SpecialPools != nil {
  2128. isMatchPool := false
  2129. for _, specialPool := range modelarts.SpecialPools.Pools {
  2130. if cloudbrain.IsElementExist(specialPool.JobType, jobType) {
  2131. if isInOrg, _ := models.IsOrganizationMemberByOrgName(specialPool.Org, ctx.User.ID); isInOrg {
  2132. isMatchPool = true
  2133. isMatchSpec := false
  2134. for _, flavor := range specialPool.Flavor {
  2135. if flavor.Value == flavorCode {
  2136. isMatchSpec = true
  2137. break
  2138. }
  2139. }
  2140. if !isMatchSpec {
  2141. return "cloudbrain.wrong_specification"
  2142. }
  2143. }
  2144. }
  2145. }
  2146. if !isMatchPool {
  2147. isMatchSpec := false
  2148. if jobType == string(models.JobTypeDebug) {
  2149. for _, flavor := range setting.StFlavorInfo.FlavorInfo {
  2150. if flavor.Value == flavorCode {
  2151. isMatchSpec = true
  2152. break
  2153. }
  2154. }
  2155. } else {
  2156. var flavorInfos modelarts.Flavor
  2157. json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos)
  2158. for _, flavor := range flavorInfos.Info {
  2159. if flavor.Code == flavorCode {
  2160. isMatchSpec = true
  2161. break
  2162. }
  2163. }
  2164. }
  2165. if !isMatchSpec {
  2166. return "cloudbrain.wrong_specification"
  2167. }
  2168. }
  2169. }
  2170. return ""
  2171. }
  2172. func InferenceJobIndex(ctx *context.Context) {
  2173. MustEnableModelArts(ctx)
  2174. repo := ctx.Repo.Repository
  2175. page := ctx.QueryInt("page")
  2176. if page <= 0 {
  2177. page = 1
  2178. }
  2179. listType := ctx.Query("listType")
  2180. ctx.Data["ListType"] = listType
  2181. if listType == models.AllResource {
  2182. listType = ""
  2183. }
  2184. var jobTypes []string
  2185. jobTypes = append(jobTypes, string(models.JobTypeInference))
  2186. tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
  2187. ListOptions: models.ListOptions{
  2188. Page: page,
  2189. PageSize: setting.UI.IssuePagingNum,
  2190. },
  2191. RepoID: repo.ID,
  2192. ComputeResource: listType,
  2193. JobTypes: jobTypes,
  2194. Type: models.TypeCloudBrainAll,
  2195. })
  2196. if err != nil {
  2197. ctx.ServerError("Cloudbrain", err)
  2198. return
  2199. }
  2200. for i, task := range tasks {
  2201. tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
  2202. tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
  2203. if tasks[i].ComputeResource == "" {
  2204. tasks[i].ComputeResource = models.NPUResource
  2205. }
  2206. }
  2207. repoId := ctx.Repo.Repository.ID
  2208. Type := -1
  2209. _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
  2210. ListOptions: models.ListOptions{
  2211. Page: 1,
  2212. PageSize: 2,
  2213. },
  2214. RepoID: repoId,
  2215. Type: Type,
  2216. New: MODEL_LATEST,
  2217. })
  2218. ctx.Data["MODEL_COUNT"] = model_count
  2219. pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
  2220. pager.SetDefaultParams(ctx)
  2221. ctx.Data["Page"] = pager
  2222. ctx.Data["PageIsCloudBrain"] = true
  2223. ctx.Data["Tasks"] = tasks
  2224. ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
  2225. ctx.Data["RepoIsEmpty"] = repo.IsEmpty
  2226. ctx.HTML(200, tplModelArtsInferenceJobIndex)
  2227. }
  2228. func InferenceJobNew(ctx *context.Context) {
  2229. err := inferenceJobNewDataPrepare(ctx)
  2230. if err != nil {
  2231. ctx.ServerError("get new inference-job info failed", err)
  2232. return
  2233. }
  2234. ctx.HTML(200, tplModelArtsInferenceJobNew)
  2235. }
  2236. func inferenceJobNewDataPrepare(ctx *context.Context) error {
  2237. ctx.Data["PageIsCloudBrain"] = true
  2238. ctx.Data["newInference"] = true
  2239. t := time.Now()
  2240. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  2241. ctx.Data["display_job_name"] = displayJobName
  2242. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  2243. if err != nil {
  2244. ctx.ServerError("GetAllUserAttachments failed:", err)
  2245. return err
  2246. }
  2247. ctx.Data["attachments"] = attachs
  2248. var resourcePools modelarts.ResourcePool
  2249. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  2250. ctx.ServerError("json.Unmarshal failed:", err)
  2251. return err
  2252. }
  2253. ctx.Data["resource_pools"] = resourcePools.Info
  2254. var engines modelarts.Engine
  2255. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  2256. ctx.ServerError("json.Unmarshal failed:", err)
  2257. return err
  2258. }
  2259. ctx.Data["engines"] = engines.Info
  2260. var versionInfos modelarts.VersionInfo
  2261. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  2262. ctx.ServerError("json.Unmarshal failed:", err)
  2263. return err
  2264. }
  2265. ctx.Data["engine_versions"] = versionInfos.Version
  2266. prepareCloudbrainTwoInferenceSpecs(ctx)
  2267. ctx.Data["params"] = ""
  2268. ctx.Data["branchName"] = ctx.Repo.BranchName
  2269. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  2270. if err != nil {
  2271. ctx.ServerError("getConfigList failed:", err)
  2272. return err
  2273. }
  2274. ctx.Data["config_list"] = configList.ParaConfigs
  2275. repoId := ctx.Repo.Repository.ID
  2276. Type := -1
  2277. _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
  2278. ListOptions: models.ListOptions{
  2279. Page: 1,
  2280. PageSize: 2,
  2281. },
  2282. RepoID: repoId,
  2283. Type: Type,
  2284. New: MODEL_LATEST,
  2285. })
  2286. ctx.Data["MODEL_COUNT"] = model_count
  2287. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  2288. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  2289. ctx.Data["WaitCount"] = waitCount
  2290. return nil
  2291. }
  2292. func prepareCloudbrainTwoInferenceSpecs(ctx *context.Context) {
  2293. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  2294. JobType: models.JobTypeInference,
  2295. ComputeResource: models.NPU,
  2296. Cluster: models.OpenICluster,
  2297. AiCenterCode: models.AICenterOfCloudBrainTwo,
  2298. })
  2299. ctx.Data["Specs"] = noteBookSpecs
  2300. }
  2301. func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error {
  2302. ctx.Data["PageIsCloudBrain"] = true
  2303. t := time.Now()
  2304. var jobName = "inference" + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  2305. ctx.Data["job_name"] = jobName
  2306. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  2307. if err != nil {
  2308. ctx.ServerError("GetAllUserAttachments failed:", err)
  2309. return err
  2310. }
  2311. ctx.Data["attachments"] = attachs
  2312. var resourcePools modelarts.ResourcePool
  2313. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  2314. ctx.ServerError("json.Unmarshal failed:", err)
  2315. return err
  2316. }
  2317. ctx.Data["resource_pools"] = resourcePools.Info
  2318. var engines modelarts.Engine
  2319. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  2320. ctx.ServerError("json.Unmarshal failed:", err)
  2321. return err
  2322. }
  2323. ctx.Data["engines"] = engines.Info
  2324. var versionInfos modelarts.VersionInfo
  2325. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  2326. ctx.ServerError("json.Unmarshal failed:", err)
  2327. return err
  2328. }
  2329. ctx.Data["engine_versions"] = versionInfos.Version
  2330. prepareCloudbrainTwoInferenceSpecs(ctx)
  2331. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  2332. if err != nil {
  2333. ctx.ServerError("getConfigList failed:", err)
  2334. return err
  2335. }
  2336. var Parameters modelarts.Parameters
  2337. if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
  2338. ctx.ServerError("json.Unmarshal failed:", err)
  2339. return err
  2340. }
  2341. ctx.Data["params"] = Parameters.Parameter
  2342. ctx.Data["config_list"] = configList.ParaConfigs
  2343. ctx.Data["bootFile"] = form.BootFile
  2344. ctx.Data["uuid"] = form.Attachment
  2345. _, datasetNames, err := models.GetDatasetInfo(form.Attachment)
  2346. if err != nil {
  2347. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  2348. return nil
  2349. }
  2350. ctx.Data["dataset_name"] = datasetNames
  2351. ctx.Data["branch_name"] = form.BranchName
  2352. ctx.Data["model_name"] = form.ModelName
  2353. ctx.Data["model_version"] = form.ModelVersion
  2354. ctx.Data["ckpt_name"] = form.CkptName
  2355. ctx.Data["train_url"] = form.TrainUrl
  2356. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  2357. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  2358. ctx.Data["WaitCount"] = waitCount
  2359. return nil
  2360. }
  2361. func InferenceJobShow(ctx *context.Context) {
  2362. ctx.Data["PageIsCloudBrain"] = true
  2363. var jobID = ctx.Params(":jobid")
  2364. page := ctx.QueryInt("page")
  2365. if page <= 0 {
  2366. page = 1
  2367. }
  2368. task, err := models.GetCloudbrainByJobID(jobID)
  2369. if err != nil {
  2370. log.Error("GetInferenceTask(%s) failed:%v", jobID, err.Error())
  2371. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  2372. return
  2373. }
  2374. //设置权限
  2375. canNewJob, err := canUserCreateTrainJobVersion(ctx, task.UserID)
  2376. if err != nil {
  2377. ctx.ServerError("canNewJob failed", err)
  2378. return
  2379. }
  2380. ctx.Data["canNewJob"] = canNewJob
  2381. //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
  2382. var parameters models.Parameters
  2383. err = json.Unmarshal([]byte(task.Parameters), &parameters)
  2384. if err != nil {
  2385. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  2386. trainJobNewDataPrepare(ctx)
  2387. return
  2388. }
  2389. if len(parameters.Parameter) > 0 {
  2390. paramTemp := ""
  2391. for _, Parameter := range parameters.Parameter {
  2392. param := Parameter.Label + " = " + Parameter.Value + "; "
  2393. paramTemp = paramTemp + param
  2394. }
  2395. task.Parameters = paramTemp[:len(paramTemp)-2]
  2396. } else {
  2397. task.Parameters = ""
  2398. }
  2399. prepareSpec4Show(ctx, task)
  2400. LabelName := strings.Fields(task.LabelName)
  2401. ctx.Data["labelName"] = LabelName
  2402. ctx.Data["jobID"] = jobID
  2403. ctx.Data["jobName"] = task.JobName
  2404. ctx.Data["displayJobName"] = task.DisplayJobName
  2405. ctx.Data["task"] = task
  2406. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  2407. ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  2408. tempUids := []int64{}
  2409. tempUids = append(tempUids, task.UserID)
  2410. JobCreater, err := models.GetUserNamesByIDs(tempUids)
  2411. if err != nil {
  2412. log.Error("GetUserNamesByIDs (WhitelistUserIDs): %v", err)
  2413. }
  2414. ctx.Data["userName"] = JobCreater[0]
  2415. ctx.HTML(http.StatusOK, tplModelArtsInferenceJobShow)
  2416. }
  2417. func ModelDownload(ctx *context.Context) {
  2418. var (
  2419. err error
  2420. )
  2421. jobID := ctx.Params(":jobid")
  2422. versionName := ctx.Query("version_name")
  2423. parentDir := ctx.Query("parent_dir")
  2424. fileName := ctx.Query("file_name")
  2425. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2426. if err != nil {
  2427. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error())
  2428. return
  2429. }
  2430. var url string
  2431. if task.ComputeResource == models.NPUResource {
  2432. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir, fileName), "/")
  2433. url, err = storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  2434. if err != nil {
  2435. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  2436. ctx.ServerError("GetObsCreateSignedUrl", err)
  2437. return
  2438. }
  2439. } else if task.ComputeResource == models.GPUResource {
  2440. filePath := setting.CBCodePathPrefix + task.JobName + cloudbrain.ModelMountPath + "/" + parentDir
  2441. url, err = storage.Attachments.PresignedGetURL(filePath, fileName)
  2442. if err != nil {
  2443. log.Error("PresignedGetURL failed: %v", err.Error(), ctx.Data["msgID"])
  2444. ctx.ServerError("PresignedGetURL", err)
  2445. return
  2446. }
  2447. }
  2448. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2449. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  2450. }
  2451. func ResultDownload(ctx *context.Context) {
  2452. var (
  2453. err error
  2454. )
  2455. versionName := ctx.Query("version_name")
  2456. parentDir := ctx.Query("parent_dir")
  2457. fileName := ctx.Query("file_name")
  2458. task := ctx.Cloudbrain
  2459. if err != nil {
  2460. ctx.Data["error"] = err.Error()
  2461. }
  2462. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName, parentDir, fileName), "/")
  2463. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
  2464. if err != nil {
  2465. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  2466. ctx.ServerError("GetObsCreateSignedUrl", err)
  2467. return
  2468. }
  2469. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2470. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  2471. }
  2472. func DeleteJobStorage(jobName string) error {
  2473. //delete local
  2474. localJobPath := setting.JobPath + jobName
  2475. err := os.RemoveAll(localJobPath)
  2476. if err != nil {
  2477. log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
  2478. }
  2479. //delete oss
  2480. dirPath := setting.CodePathPrefix + jobName + "/"
  2481. err = storage.ObsRemoveObject(setting.Bucket, dirPath)
  2482. if err != nil {
  2483. log.Error("ObsRemoveObject(%s) failed:%v", localJobPath, err)
  2484. }
  2485. return nil
  2486. }
  2487. func DownloadMultiResultFile(ctx *context.Context) {
  2488. var jobID = ctx.Params(":jobid")
  2489. var versionName = ctx.Query("version_name")
  2490. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2491. if err != nil {
  2492. log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
  2493. return
  2494. }
  2495. // if !isCanDeleteOrDownload(ctx, task) {
  2496. // ctx.ServerError("no right.", errors.New(ctx.Tr("repo.model_noright")))
  2497. // return
  2498. // }
  2499. // path := Model_prefix + models.AttachmentRelativePath(id) + "/"
  2500. path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName), "/") + "/"
  2501. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
  2502. if err == nil {
  2503. //count++
  2504. // models.ModifyModelDownloadCount(id)
  2505. returnFileName := task.DisplayJobName + ".zip"
  2506. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+returnFileName)
  2507. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  2508. w := zip.NewWriter(ctx.Resp)
  2509. defer w.Close()
  2510. for _, oneFile := range allFile {
  2511. if oneFile.IsDir {
  2512. log.Info("zip dir name:" + oneFile.FileName)
  2513. } else {
  2514. log.Info("zip file name:" + oneFile.FileName)
  2515. fDest, err := w.Create(oneFile.FileName)
  2516. if err != nil {
  2517. log.Info("create zip entry error, download file failed: %s\n", err.Error())
  2518. ctx.ServerError("download file failed:", err)
  2519. return
  2520. }
  2521. body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName)
  2522. if err != nil {
  2523. log.Info("download file failed: %s\n", err.Error())
  2524. ctx.ServerError("download file failed:", err)
  2525. return
  2526. } else {
  2527. defer body.Close()
  2528. p := make([]byte, 1024)
  2529. var readErr error
  2530. var readCount int
  2531. // 读取对象内容
  2532. for {
  2533. readCount, readErr = body.Read(p)
  2534. if readCount > 0 {
  2535. fDest.Write(p[:readCount])
  2536. }
  2537. if readErr != nil {
  2538. break
  2539. }
  2540. }
  2541. }
  2542. }
  2543. }
  2544. } else {
  2545. log.Info("error,msg=" + err.Error())
  2546. ctx.ServerError("no file to download.", err)
  2547. }
  2548. }
  2549. func SetJobCount(ctx *context.Context) {
  2550. repoId := ctx.Repo.Repository.ID
  2551. _, jobCount, err := models.Cloudbrains(&models.CloudbrainsOptions{
  2552. RepoID: repoId,
  2553. Type: models.TypeCloudBrainAll,
  2554. })
  2555. if err != nil {
  2556. ctx.ServerError("Get job faild:", err)
  2557. return
  2558. }
  2559. ctx.Data["jobCount"] = jobCount
  2560. }
  2561. func TrainJobDownloadLogFile(ctx *context.Context) {
  2562. var (
  2563. err error
  2564. )
  2565. var jobID = ctx.Params(":jobid")
  2566. versionName := ctx.Query("version_name")
  2567. task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
  2568. if err != nil {
  2569. log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error(), ctx.Data["msgID"])
  2570. ctx.ServerError("GetCloudbrainByJobIDAndVersionName", err)
  2571. return
  2572. }
  2573. prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
  2574. key, err := storage.GetObsLogFileName(prefix)
  2575. if err != nil {
  2576. log.Error("GetObsLogFileName(%s) failed:%v", jobID, err.Error(), ctx.Data["msgID"])
  2577. ctx.ServerError("GetObsLogFileName", err)
  2578. return
  2579. }
  2580. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, key)
  2581. if err != nil {
  2582. log.Error("GetObsCreateSignedUrlByBucketAndKey failed: %v", err.Error(), ctx.Data["msgID"])
  2583. ctx.ServerError("GetObsCreateSignedUrlByBucketAndKey", err)
  2584. return
  2585. }
  2586. ctx.Resp.Header().Set("Cache-Control", "max-age=0")
  2587. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
  2588. }
  2589. func getDatasUrlListByUUIDS(uuidStr string) ([]models.Datasurl, string, string, bool, error) {
  2590. var isMultiDataset bool
  2591. var dataUrl string
  2592. var datasetNames string
  2593. var datasUrlList []models.Datasurl
  2594. uuids := strings.Split(uuidStr, ";")
  2595. if len(uuids) > setting.MaxDatasetNum {
  2596. log.Error("the dataset count(%d) exceed the limit", len(uuids))
  2597. return datasUrlList, dataUrl, datasetNames, isMultiDataset, errors.New("the dataset count exceed the limit")
  2598. }
  2599. datasetInfos := make(map[string]models.DatasetInfo)
  2600. attachs, err := models.GetAttachmentsByUUIDs(uuids)
  2601. if err != nil || len(attachs) != len(uuids) {
  2602. log.Error("GetAttachmentsByUUIDs failed: %v", err)
  2603. return datasUrlList, dataUrl, datasetNames, isMultiDataset, errors.New("GetAttachmentsByUUIDs failed")
  2604. }
  2605. for i, tmpUuid := range uuids {
  2606. var attach *models.Attachment
  2607. for _, tmpAttach := range attachs {
  2608. if tmpAttach.UUID == tmpUuid {
  2609. attach = tmpAttach
  2610. break
  2611. }
  2612. }
  2613. if attach == nil {
  2614. log.Error("GetAttachmentsByUUIDs failed: %v", err)
  2615. return datasUrlList, dataUrl, datasetNames, isMultiDataset, errors.New("GetAttachmentsByUUIDs failed")
  2616. }
  2617. fileName := strings.TrimSuffix(strings.TrimSuffix(strings.TrimSuffix(attach.Name, ".zip"), ".tar.gz"), ".tgz")
  2618. for _, datasetInfo := range datasetInfos {
  2619. if fileName == datasetInfo.Name {
  2620. log.Error("the dataset name is same: %v", attach.Name)
  2621. return datasUrlList, dataUrl, datasetNames, isMultiDataset, errors.New("the dataset name is same")
  2622. }
  2623. }
  2624. if len(attachs) <= 1 {
  2625. dataUrl = "/" + setting.Bucket + "/" + setting.BasePath + path.Join(attach.UUID[0:1], attach.UUID[1:2]) + "/" + attach.UUID + attach.UUID + "/"
  2626. isMultiDataset = false
  2627. } else {
  2628. dataUrl = "/" + setting.Bucket + "/" + setting.BasePath + path.Join(attachs[0].UUID[0:1], attachs[0].UUID[1:2]) + "/" + attachs[0].UUID + attachs[0].UUID + "/"
  2629. datasetUrl := "s3://" + setting.Bucket + "/" + setting.BasePath + path.Join(attach.UUID[0:1], attach.UUID[1:2]) + "/" + attach.UUID + attach.UUID + "/"
  2630. datasUrlList = append(datasUrlList, models.Datasurl{
  2631. DatasetUrl: datasetUrl,
  2632. DatasetName: fileName,
  2633. })
  2634. isMultiDataset = true
  2635. }
  2636. if i == 0 {
  2637. datasetNames = attach.Name
  2638. } else {
  2639. datasetNames += ";" + attach.Name
  2640. }
  2641. }
  2642. return datasUrlList, dataUrl, datasetNames, isMultiDataset, nil
  2643. }