You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

grampus.go 60 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810
  1. package repo
  2. import (
  3. "code.gitea.io/gitea/services/lock"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "path"
  11. "strconv"
  12. "strings"
  13. "code.gitea.io/gitea/modules/urfs_client/urchin"
  14. "code.gitea.io/gitea/routers/response"
  15. "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
  16. "code.gitea.io/gitea/modules/dataset"
  17. "code.gitea.io/gitea/services/cloudbrain/resource"
  18. "code.gitea.io/gitea/services/reward/point/account"
  19. "code.gitea.io/gitea/modules/auth"
  20. "code.gitea.io/gitea/modules/git"
  21. "code.gitea.io/gitea/modules/grampus"
  22. "code.gitea.io/gitea/modules/modelarts"
  23. "code.gitea.io/gitea/modules/notification"
  24. "code.gitea.io/gitea/modules/timeutil"
  25. "code.gitea.io/gitea/modules/util"
  26. "github.com/unknwon/com"
  27. "code.gitea.io/gitea/models"
  28. "code.gitea.io/gitea/modules/base"
  29. "code.gitea.io/gitea/modules/cloudbrain"
  30. "code.gitea.io/gitea/modules/context"
  31. "code.gitea.io/gitea/modules/log"
  32. "code.gitea.io/gitea/modules/setting"
  33. cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
  34. )
  35. const (
  36. tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
  37. tplGrampusNotebookShow base.TplName = "repo/grampus/notebook/show"
  38. //GPU
  39. tplGrampusNotebookGPUNew base.TplName = "repo/grampus/notebook/gpu/new"
  40. tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
  41. //NPU
  42. tplGrampusNotebookNPUNew base.TplName = "repo/grampus/notebook/npu/new"
  43. tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
  44. //GCU
  45. tplGrampusNotebookGCUNew base.TplName = "repo/grampus/notebook/gcu/new"
  46. )
  47. func GrampusNotebookNew(ctx *context.Context) {
  48. ctx.Data["IsCreate"] = true
  49. notebookType := ctx.QueryInt("type")
  50. processType := grampus.ProcessorTypeGPU
  51. if notebookType == 1 {
  52. processType = grampus.ProcessorTypeNPU
  53. } else if notebookType == 2 {
  54. processType = grampus.ProcessorTypeGCU
  55. }
  56. err := grampusNotebookNewDataPrepare(ctx, processType)
  57. if err != nil {
  58. ctx.ServerError("get new notebook-job info failed", err)
  59. return
  60. }
  61. if processType == grampus.ProcessorTypeGPU {
  62. ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
  63. } else if processType == grampus.ProcessorTypeNPU {
  64. ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
  65. } else if processType == grampus.ProcessorTypeGCU {
  66. ctx.HTML(http.StatusOK, tplGrampusNotebookGCUNew)
  67. }
  68. }
  69. func GrampusTrainJobGPUNew(ctx *context.Context) {
  70. ctx.Data["IsCreate"] = true
  71. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  72. if err != nil {
  73. ctx.ServerError("get new train-job info failed", err)
  74. return
  75. }
  76. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  77. }
  78. func GrampusTrainJobNPUNew(ctx *context.Context) {
  79. ctx.Data["IsCreate"] = true
  80. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  81. if err != nil {
  82. ctx.ServerError("get new train-job info failed", err)
  83. return
  84. }
  85. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  86. }
  87. func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) {
  88. ctx.Data["IsCreate"] = true
  89. displayJobName := form.DisplayJobName
  90. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  91. uuid := form.Attachment
  92. description := form.Description
  93. repo := ctx.Repo.Repository
  94. branchName := form.BranchName
  95. image := strings.TrimSpace(form.Image)
  96. codeStoragePath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  97. tpl := tplGrampusNotebookGPUNew
  98. processType := grampus.ProcessorTypeGPU
  99. computeSource := models.GPUResource
  100. computeSourceSimple := models.GPU
  101. if form.Type == 1 {
  102. tpl = tplGrampusNotebookNPUNew
  103. processType = grampus.ProcessorTypeNPU
  104. computeSource = models.NPUResource
  105. computeSourceSimple = models.NPU
  106. codeStoragePath = grampus.JobPath + jobName + modelarts.CodePath
  107. } else if form.Type == 2 {
  108. tpl = tplGrampusNotebookGCUNew
  109. processType = grampus.ProcessorTypeGCU
  110. computeSource = models.GCUResource
  111. computeSourceSimple = models.GCU
  112. codeStoragePath = setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  113. }
  114. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: string(models.JobTypeDebug)}, User: ctx.User})
  115. defer func() {
  116. if lockOperator != nil {
  117. lockOperator.Unlock()
  118. }
  119. }()
  120. if errMsg != "" {
  121. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  122. grampusNotebookNewDataPrepare(ctx, processType)
  123. ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
  124. return
  125. }
  126. if !jobNamePattern.MatchString(displayJobName) {
  127. grampusNotebookNewDataPrepare(ctx, processType)
  128. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  129. return
  130. }
  131. //check count limit
  132. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeSource)
  133. if err != nil {
  134. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  135. grampusNotebookNewDataPrepare(ctx, processType)
  136. ctx.RenderWithErr("system error", tpl, &form)
  137. return
  138. } else {
  139. if count >= 1 {
  140. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  141. grampusNotebookNewDataPrepare(ctx, processType)
  142. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  143. return
  144. }
  145. }
  146. //check whether the task name in the project is duplicated
  147. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
  148. if err == nil {
  149. if len(tasks) != 0 {
  150. log.Error("the job name did already exist", ctx.Data["MsgID"])
  151. grampusNotebookNewDataPrepare(ctx, processType)
  152. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  153. return
  154. }
  155. } else {
  156. if !models.IsErrJobNotExist(err) {
  157. log.Error("system error, %v", err, ctx.Data["MsgID"])
  158. grampusNotebookNewDataPrepare(ctx, processType)
  159. ctx.RenderWithErr("system error", tpl, &form)
  160. return
  161. }
  162. }
  163. //check specification
  164. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  165. JobType: models.JobTypeDebug,
  166. ComputeResource: computeSourceSimple,
  167. Cluster: models.C2NetCluster,
  168. })
  169. if err != nil || spec == nil {
  170. grampusNotebookNewDataPrepare(ctx, processType)
  171. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  172. return
  173. }
  174. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  175. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  176. grampusNotebookNewDataPrepare(ctx, processType)
  177. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
  178. return
  179. }
  180. var datasetInfos map[string]models.DatasetInfo
  181. var datasetNames string
  182. //var
  183. if uuid != "" {
  184. datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, computeSourceSimple)
  185. if err != nil {
  186. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  187. grampusNotebookNewDataPrepare(ctx, processType)
  188. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  189. return
  190. }
  191. }
  192. //prepare code and out path
  193. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  194. _, err = ioutil.ReadDir(codeLocalPath)
  195. if err == nil {
  196. os.RemoveAll(codeLocalPath)
  197. }
  198. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  199. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  200. grampusNotebookNewDataPrepare(ctx, processType)
  201. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  202. return
  203. }
  204. if processType == grampus.ProcessorTypeGPU || processType == grampus.ProcessorTypeGCU {
  205. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  206. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  207. grampusNotebookNewDataPrepare(ctx, processType)
  208. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  209. return
  210. }
  211. } else {
  212. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  213. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  214. grampusNotebookNewDataPrepare(ctx, processType)
  215. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  216. return
  217. }
  218. }
  219. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  220. req := &grampus.GenerateNotebookJobReq{
  221. JobName: jobName,
  222. DisplayJobName: displayJobName,
  223. ComputeResource: computeSource,
  224. ProcessType: processType,
  225. ImageUrl: image,
  226. ImageId: form.ImageID,
  227. Description: description,
  228. Uuid: uuid,
  229. CommitID: commitID,
  230. BranchName: branchName,
  231. DatasetNames: datasetNames,
  232. DatasetInfos: datasetInfos,
  233. Spec: spec,
  234. CodeStoragePath: codeStoragePath,
  235. CodeName: strings.ToLower(repo.Name),
  236. }
  237. if form.ModelName != "" { //使用预训练模型训练
  238. m, err := models.QueryModelByPath(form.PreTrainModelUrl)
  239. if err != nil {
  240. log.Error("Can not find model", err)
  241. grampusNotebookNewDataPrepare(ctx, processType)
  242. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tpl, &form)
  243. return
  244. }
  245. req.ModelName = form.ModelName
  246. req.LabelName = form.LabelName
  247. req.CkptName = form.CkptName
  248. req.ModelVersion = form.ModelVersion
  249. req.PreTrainModelUrl = form.PreTrainModelUrl
  250. req.PreTrainModelPath = getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  251. req.ModelStorageType = m.Type
  252. }
  253. _, err = grampus.GenerateNotebookJob(ctx, req)
  254. if err != nil {
  255. log.Error("GenerateNotebookJob failed:%v", err.Error(), ctx.Data["MsgID"])
  256. grampusTrainJobNewDataPrepare(ctx, processType)
  257. ctx.RenderWithErr(err.Error(), tpl, &form)
  258. return
  259. }
  260. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  261. }
  262. func grampusNotebookNewDataPrepare(ctx *context.Context, processType string) error {
  263. ctx.Data["PageIsCloudBrain"] = true
  264. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  265. ctx.Data["display_job_name"] = displayJobName
  266. //get valid images
  267. if processType == grampus.ProcessorTypeNPU || processType == grampus.ProcessorTypeGCU {
  268. images, err := grampus.GetImages(processType, string(models.JobTypeDebug))
  269. if err != nil {
  270. log.Error("GetImages failed:", err.Error())
  271. } else {
  272. ctx.Data["images"] = images.Infos
  273. }
  274. }
  275. //prepare available specs
  276. computeResourceSimple := models.GPU
  277. datasetType := models.TypeCloudBrainOne
  278. computeResource := models.GPUResource
  279. if processType == grampus.ProcessorTypeNPU {
  280. computeResourceSimple = models.NPU
  281. datasetType = models.TypeCloudBrainTwo
  282. computeResource = models.NPUResource
  283. } else if processType == grampus.ProcessorTypeGCU {
  284. computeResourceSimple = models.GCU
  285. datasetType = models.TypeCloudBrainAll
  286. computeResource = models.GCUResource
  287. }
  288. prepareGrampusSpecs(ctx, computeResourceSimple, models.JobTypeDebug)
  289. //get branches
  290. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  291. if err != nil {
  292. log.Error("GetBranches error:", err.Error())
  293. } else {
  294. ctx.Data["branches"] = branches
  295. }
  296. ctx.Data["branchName"] = ctx.Repo.BranchName
  297. ctx.Data["datasetType"] = datasetType
  298. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, computeResource, models.JobTypeDebug)
  299. ctx.Data["WaitCount"] = waitCount
  300. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeResource)
  301. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  302. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  303. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  304. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  305. return nil
  306. }
  307. func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error {
  308. ctx.Data["PageIsCloudBrain"] = true
  309. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  310. ctx.Data["display_job_name"] = displayJobName
  311. //get valid images
  312. if processType == grampus.ProcessorTypeNPU {
  313. images, err := grampus.GetImages(processType, string(models.JobTypeTrain))
  314. if err != nil {
  315. log.Error("GetImages failed:", err.Error())
  316. } else {
  317. ctx.Data["images"] = images.Infos
  318. }
  319. }
  320. //prepare available specs
  321. if processType == grampus.ProcessorTypeNPU {
  322. prepareGrampusSpecs(ctx, models.NPU)
  323. } else if processType == grampus.ProcessorTypeGPU {
  324. prepareGrampusSpecs(ctx, models.GPU)
  325. }
  326. //get branches
  327. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  328. if err != nil {
  329. log.Error("GetBranches error:", err.Error())
  330. } else {
  331. ctx.Data["branches"] = branches
  332. }
  333. ctx.Data["branchName"] = ctx.Repo.BranchName
  334. if processType == grampus.ProcessorTypeGPU {
  335. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  336. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GPUResource, models.JobTypeTrain)
  337. ctx.Data["WaitCount"] = waitCount
  338. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  339. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  340. } else if processType == grampus.ProcessorTypeNPU {
  341. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  342. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.NPUResource, models.JobTypeTrain)
  343. ctx.Data["WaitCount"] = waitCount
  344. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  345. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  346. }
  347. if ctx.Cloudbrain != nil {
  348. uuids, datasetNames := dataset.GetFilterDeletedAttachments(ctx.Cloudbrain.Uuid)
  349. ctx.Data["attachment"] = uuids
  350. ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile
  351. ctx.Data["image_id"] = ctx.Cloudbrain.ImageID
  352. ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters
  353. ctx.Data["description"] = ctx.Cloudbrain.Description
  354. ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName
  355. ctx.Data["engine_name"] = ctx.Cloudbrain.EngineName
  356. ctx.Data["work_server_number"] = ctx.Cloudbrain.WorkServerNumber
  357. if ctx.Cloudbrain.Image != "" {
  358. ctx.Data["image"] = ctx.Cloudbrain.Image
  359. } else {
  360. ctx.Data["image"] = ctx.Cloudbrain.EngineName
  361. }
  362. ctx.Data["dataset_name"] = datasetNames
  363. ctx.Data["model_name"] = ctx.Cloudbrain.ModelName
  364. ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion
  365. ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName
  366. ctx.Data["label_names"] = ctx.Cloudbrain.LabelName
  367. ctx.Data["pre_train_model_url"] = ctx.Cloudbrain.PreTrainModelUrl
  368. spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID)
  369. if spec != nil {
  370. ctx.Data["spec_id"] = spec.ID
  371. }
  372. }
  373. return nil
  374. }
  375. func GrampusTrainJobVersionNew(ctx *context.Context) {
  376. task := ctx.Cloudbrain
  377. ctx.Data["IsCreate"] = false
  378. if task.ComputeResource == models.GPUResource {
  379. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  380. if err != nil {
  381. ctx.ServerError("get new train-job version info failed", err)
  382. return
  383. }
  384. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  385. } else if task.ComputeResource == models.NPUResource {
  386. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  387. if err != nil {
  388. ctx.ServerError("get new train-job version info failed", err)
  389. return
  390. }
  391. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  392. }
  393. }
  394. func prepareGrampusSpecs(ctx *context.Context, computeResource string, jobType ...models.JobType) {
  395. tempJobType := models.JobTypeTrain
  396. if len(jobType) > 0 {
  397. tempJobType = jobType[0]
  398. }
  399. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  400. JobType: tempJobType,
  401. ComputeResource: computeResource,
  402. Cluster: models.C2NetCluster,
  403. })
  404. ctx.Data["Specs"] = noteBookSpecs
  405. }
  406. func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
  407. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  408. log.Error("the boot file(%s) must be a python file", form.BootFile)
  409. return errors.New("启动文件必须是python文件")
  410. }
  411. if form.BranchName == "" {
  412. log.Error("the branch must not be null!", form.BranchName)
  413. return errors.New("代码分支不能为空!")
  414. }
  415. return nil
  416. }
  417. func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  418. ctx.Data["IsCreate"] = true
  419. grampusTrainJobGpuCreate(ctx, form)
  420. }
  421. func grampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  422. displayJobName := form.DisplayJobName
  423. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  424. uuid := form.Attachment
  425. description := form.Description
  426. bootFile := strings.TrimSpace(form.BootFile)
  427. params := form.Params
  428. repo := ctx.Repo.Repository
  429. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  430. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  431. branchName := form.BranchName
  432. image := strings.TrimSpace(form.Image)
  433. tpl := tplGrampusTrainJobGPUNew
  434. if !jobNamePattern.MatchString(displayJobName) {
  435. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  436. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  437. return
  438. }
  439. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: string(models.JobTypeTrain)}, User: ctx.User})
  440. defer func() {
  441. if lockOperator != nil {
  442. lockOperator.Unlock()
  443. }
  444. }()
  445. if errMsg != "" {
  446. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  447. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  448. ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
  449. return
  450. }
  451. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  452. if err != nil || !bootFileExist {
  453. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  454. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  455. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  456. return
  457. }
  458. //check count limit
  459. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  460. if err != nil {
  461. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  462. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  463. ctx.RenderWithErr("system error", tpl, &form)
  464. return
  465. } else {
  466. if count >= 1 {
  467. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  468. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  469. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  470. return
  471. }
  472. }
  473. //check param
  474. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  475. log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
  476. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  477. ctx.RenderWithErr(err.Error(), tpl, &form)
  478. return
  479. }
  480. //check whether the task name in the project is duplicated
  481. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  482. if err == nil {
  483. if len(tasks) != 0 {
  484. log.Error("the job name did already exist", ctx.Data["MsgID"])
  485. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  486. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  487. return
  488. }
  489. } else {
  490. if !models.IsErrJobNotExist(err) {
  491. log.Error("system error, %v", err, ctx.Data["MsgID"])
  492. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  493. ctx.RenderWithErr("system error", tpl, &form)
  494. return
  495. }
  496. }
  497. //check specification
  498. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  499. JobType: models.JobTypeTrain,
  500. ComputeResource: models.GPU,
  501. Cluster: models.C2NetCluster,
  502. })
  503. if err != nil || spec == nil {
  504. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  505. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  506. return
  507. }
  508. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  509. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  510. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  511. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobGPUNew, &form)
  512. return
  513. }
  514. //check dataset
  515. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  516. if err != nil {
  517. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  518. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  519. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  520. return
  521. }
  522. //prepare code and out path
  523. _, err = ioutil.ReadDir(codeLocalPath)
  524. if err == nil {
  525. os.RemoveAll(codeLocalPath)
  526. }
  527. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  528. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  529. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  530. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  531. return
  532. }
  533. //todo: upload code (send to file_server todo this work?)
  534. //upload code
  535. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  536. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  537. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  538. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  539. return
  540. }
  541. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  542. if err := mkModelPath(modelPath); err != nil {
  543. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  544. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  545. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  546. return
  547. }
  548. //init model readme
  549. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  550. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  551. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  552. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  553. return
  554. }
  555. var datasetRemotePath, allFileName string
  556. for _, datasetInfo := range datasetInfos {
  557. if datasetRemotePath == "" {
  558. datasetRemotePath = datasetInfo.DataLocalPath
  559. allFileName = datasetInfo.FullName
  560. } else {
  561. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  562. allFileName = allFileName + ";" + datasetInfo.FullName
  563. }
  564. }
  565. //prepare command
  566. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  567. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName, "")
  568. if err != nil {
  569. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  570. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  571. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  572. return
  573. }
  574. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  575. req := &grampus.GenerateTrainJobReq{
  576. JobName: jobName,
  577. DisplayJobName: displayJobName,
  578. ComputeResource: models.GPUResource,
  579. ProcessType: grampus.ProcessorTypeGPU,
  580. Command: command,
  581. ImageUrl: image,
  582. Description: description,
  583. BootFile: bootFile,
  584. Uuid: uuid,
  585. CommitID: commitID,
  586. BranchName: branchName,
  587. Params: form.Params,
  588. EngineName: image,
  589. DatasetNames: datasetNames,
  590. DatasetInfos: datasetInfos,
  591. IsLatestVersion: modelarts.IsLatestVersion,
  592. VersionCount: modelarts.VersionCountOne,
  593. WorkServerNumber: 1,
  594. Spec: spec,
  595. }
  596. if form.ModelName != "" { //使用预训练模型训练
  597. req.ModelName = form.ModelName
  598. req.LabelName = form.LabelName
  599. req.CkptName = form.CkptName
  600. req.ModelVersion = form.ModelVersion
  601. req.PreTrainModelUrl = form.PreTrainModelUrl
  602. }
  603. _, err = grampus.GenerateTrainJob(ctx, req)
  604. if err != nil {
  605. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  606. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  607. ctx.RenderWithErr(err.Error(), tpl, &form)
  608. return
  609. }
  610. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  611. }
  612. func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
  613. index := strings.Index(pretrainModelDir, "/")
  614. if index > 0 {
  615. filterBucket := pretrainModelDir[index+1:]
  616. return filterBucket + fileName
  617. } else {
  618. return ""
  619. }
  620. }
  621. func GrampusTrainJobVersionCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  622. ctx.Data["IsCreate"] = false
  623. computeResource := ctx.Query("compute_resource")
  624. if computeResource == models.GPUResource {
  625. grampusTrainJobGpuCreate(ctx, form)
  626. } else if computeResource == models.NPUResource {
  627. grampusTrainJobNpuCreate(ctx, form)
  628. } else {
  629. ctx.ServerError("resource error", errors.New("compute resource is not support"))
  630. return
  631. }
  632. }
  633. func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  634. ctx.Data["IsCreate"] = true
  635. grampusTrainJobNpuCreate(ctx, form)
  636. }
  637. func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  638. displayJobName := form.DisplayJobName
  639. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  640. uuid := form.Attachment
  641. description := form.Description
  642. bootFile := strings.TrimSpace(form.BootFile)
  643. params := form.Params
  644. repo := ctx.Repo.Repository
  645. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  646. codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
  647. //dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  648. branchName := form.BranchName
  649. isLatestVersion := modelarts.IsLatestVersion
  650. versionCount := modelarts.VersionCountOne
  651. engineName := form.EngineName
  652. tpl := tplGrampusTrainJobNPUNew
  653. if !jobNamePattern.MatchString(displayJobName) {
  654. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  655. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  656. return
  657. }
  658. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: displayJobName, JobType: string(models.JobTypeTrain)}, User: ctx.User})
  659. defer func() {
  660. if lockOperator != nil {
  661. lockOperator.Unlock()
  662. }
  663. }()
  664. if errMsg != "" {
  665. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  666. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  667. ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
  668. return
  669. }
  670. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  671. if err != nil || !bootFileExist {
  672. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  673. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  674. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  675. return
  676. }
  677. //check count limit
  678. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  679. if err != nil {
  680. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  681. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  682. ctx.RenderWithErr("system error", tpl, &form)
  683. return
  684. } else {
  685. if count >= 1 {
  686. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  687. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  688. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  689. return
  690. }
  691. }
  692. //check param
  693. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  694. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  695. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  696. ctx.RenderWithErr(err.Error(), tpl, &form)
  697. return
  698. }
  699. //check whether the task name in the project is duplicated
  700. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  701. if err == nil {
  702. if len(tasks) != 0 {
  703. log.Error("the job name did already exist", ctx.Data["MsgID"])
  704. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  705. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  706. return
  707. }
  708. } else {
  709. if !models.IsErrJobNotExist(err) {
  710. log.Error("system error, %v", err, ctx.Data["MsgID"])
  711. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  712. ctx.RenderWithErr("system error", tpl, &form)
  713. return
  714. }
  715. }
  716. //check specification
  717. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  718. JobType: models.JobTypeTrain,
  719. ComputeResource: models.NPU,
  720. Cluster: models.C2NetCluster,
  721. })
  722. if err != nil || spec == nil {
  723. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  724. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  725. return
  726. }
  727. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  728. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  729. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  730. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobNPUNew, &form)
  731. return
  732. }
  733. //check dataset
  734. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU)
  735. if err != nil {
  736. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  737. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  738. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  739. return
  740. }
  741. //prepare code and out path
  742. _, err = ioutil.ReadDir(codeLocalPath)
  743. if err == nil {
  744. os.RemoveAll(codeLocalPath)
  745. }
  746. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  747. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  748. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  749. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  750. return
  751. }
  752. //todo: upload code (send to file_server todo this work?)
  753. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
  754. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  755. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  756. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  757. return
  758. }
  759. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  760. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  761. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  762. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  763. return
  764. }
  765. var datasetRemotePath, allFileName string
  766. for _, datasetInfo := range datasetInfos {
  767. if datasetRemotePath == "" {
  768. datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  769. allFileName = datasetInfo.FullName
  770. } else {
  771. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  772. allFileName = allFileName + ";" + datasetInfo.FullName
  773. }
  774. }
  775. //prepare command
  776. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  777. command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName, grampus.GetNpuModelRemoteObsUrl(jobName))
  778. if err != nil {
  779. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  780. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  781. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  782. return
  783. }
  784. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  785. req := &grampus.GenerateTrainJobReq{
  786. JobName: jobName,
  787. DisplayJobName: displayJobName,
  788. ComputeResource: models.NPUResource,
  789. ProcessType: grampus.ProcessorTypeNPU,
  790. Command: command,
  791. ImageId: form.ImageID,
  792. Description: description,
  793. CodeObsPath: codeObsPath,
  794. BootFileUrl: codeObsPath + bootFile,
  795. BootFile: bootFile,
  796. WorkServerNumber: form.WorkServerNumber,
  797. Uuid: uuid,
  798. CommitID: commitID,
  799. IsLatestVersion: isLatestVersion,
  800. BranchName: branchName,
  801. Params: form.Params,
  802. EngineName: engineName,
  803. VersionCount: versionCount,
  804. TotalVersionCount: modelarts.TotalVersionCount,
  805. DatasetNames: datasetNames,
  806. DatasetInfos: datasetInfos,
  807. Spec: spec,
  808. CodeName: strings.ToLower(repo.Name),
  809. }
  810. if form.ModelName != "" { //使用预训练模型训练
  811. req.ModelName = form.ModelName
  812. req.LabelName = form.LabelName
  813. req.CkptName = form.CkptName
  814. req.ModelVersion = form.ModelVersion
  815. req.PreTrainModelUrl = form.PreTrainModelUrl
  816. req.PreTrainModelPath = preTrainModelPath
  817. }
  818. _, err = grampus.GenerateTrainJob(ctx, req)
  819. if err != nil {
  820. log.Error("GenerateTrainJob failed:%v", err.Error())
  821. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  822. ctx.RenderWithErr(err.Error(), tpl, &form)
  823. return
  824. }
  825. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  826. }
  827. func GetGrampusNotebook(ctx *context.APIContext) {
  828. var (
  829. err error
  830. )
  831. ID := ctx.Params(":id")
  832. job, err := models.GetCloudbrainByID(ID)
  833. if err != nil {
  834. ctx.NotFound("", err)
  835. log.Error("GetCloudbrainByID failed:", err)
  836. return
  837. }
  838. jobAfter, err := cloudbrainTask.SyncGrampusNotebookStatus(job)
  839. aiCenterName := cloudbrainService.GetAiCenterShow(jobAfter.AiCenter, ctx.Context)
  840. if err != nil {
  841. ctx.NotFound(err)
  842. log.Error("Sync cloud brain one status failed:", err)
  843. return
  844. }
  845. ctx.JSON(http.StatusOK, map[string]interface{}{
  846. "ID": ID,
  847. "JobName": jobAfter.JobName,
  848. "JobStatus": jobAfter.Status,
  849. "AiCenter": aiCenterName,
  850. "CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"),
  851. "CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"),
  852. "JobDuration": jobAfter.TrainJobDuration,
  853. })
  854. }
  855. func GrampusStopJob(ctx *context.Context) {
  856. var ID = ctx.Params(":id")
  857. var resultCode = "0"
  858. var errorMsg = ""
  859. var status = ""
  860. task := ctx.Cloudbrain
  861. for {
  862. if task.Status == models.GrampusStatusStopped || task.Status == models.GrampusStatusFailed || task.Status == models.GrampusStatusSucceeded {
  863. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  864. resultCode = "-1"
  865. errorMsg = ctx.Tr("cloudbrain.Already_stopped")
  866. break
  867. }
  868. res, err := grampus.StopJob(task.JobID, task.JobType)
  869. if err != nil {
  870. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  871. resultCode = strconv.Itoa(res.ErrorCode)
  872. errorMsg = ctx.Tr("cloudbrain.Stopped_failed")
  873. break
  874. }
  875. oldStatus := task.Status
  876. task.Status = getStopJobResponseStatus(res)
  877. if task.EndTime == 0 {
  878. task.EndTime = timeutil.TimeStampNow()
  879. }
  880. task.ComputeAndSetDuration()
  881. if oldStatus != task.Status {
  882. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  883. }
  884. err = models.UpdateJob(task)
  885. if err != nil {
  886. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  887. resultCode = "-1"
  888. errorMsg = "system error"
  889. break
  890. }
  891. status = task.Status
  892. break
  893. }
  894. ctx.JSON(200, map[string]interface{}{
  895. "result_code": resultCode,
  896. "error_msg": errorMsg,
  897. "status": status,
  898. "id": ID,
  899. "StatusOK": 0,
  900. })
  901. }
  902. func getStopJobResponseStatus(res *models.GrampusStopJobResponse) string {
  903. newStatus := models.GrampusStatusStopping
  904. if res.Status != "" {
  905. newStatus = grampus.TransTrainJobStatus(res.Status)
  906. }
  907. return newStatus
  908. }
  909. func GrampusNotebookDel(ctx *context.Context) {
  910. var listType = ctx.Query("listType")
  911. if err := deleteGrampusJob(ctx); err != nil {
  912. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  913. ctx.ServerError(err.Error(), err)
  914. return
  915. }
  916. var isAdminPage = ctx.Query("isadminpage")
  917. var isHomePage = ctx.Query("ishomepage")
  918. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  919. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  920. } else if isHomePage == "true" {
  921. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  922. } else {
  923. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  924. }
  925. }
  926. func GrampusTrainJobDel(ctx *context.Context) {
  927. var listType = ctx.Query("listType")
  928. if err := deleteGrampusJob(ctx); err != nil {
  929. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  930. ctx.ServerError(err.Error(), err)
  931. return
  932. }
  933. var isAdminPage = ctx.Query("isadminpage")
  934. var isHomePage = ctx.Query("ishomepage")
  935. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  936. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  937. } else if isHomePage == "true" {
  938. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  939. } else {
  940. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  941. }
  942. }
  943. func deleteGrampusJob(ctx *context.Context) error {
  944. task := ctx.Cloudbrain
  945. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  946. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  947. return errors.New(ctx.Tr("cloudbrain.Not_Stopped"))
  948. }
  949. err := models.DeleteJob(task)
  950. if err != nil {
  951. log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"])
  952. return err
  953. }
  954. storageType := models.TypeCloudBrainOne
  955. if task.ComputeResource == models.NPUResource {
  956. storageType = models.TypeCloudBrainTwo
  957. }
  958. DeleteCloudbrainJobStorage(task.JobName, storageType)
  959. return nil
  960. }
  961. type NotebookDataset struct {
  962. DatasetUrl string `json:"dataset_url"`
  963. }
  964. func GrampusNotebookShow(ctx *context.Context) {
  965. ctx.Data["PageIsCloudBrain"] = true
  966. var task *models.Cloudbrain
  967. task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
  968. if err != nil {
  969. log.Error("GetCloudbrainByID failed:" + err.Error())
  970. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  971. return
  972. }
  973. task.ContainerIp = ""
  974. if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
  975. result, err := grampus.GetNotebookJob(task.JobID)
  976. if err != nil {
  977. log.Error("GetJob failed:" + err.Error())
  978. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  979. return
  980. }
  981. if result != nil {
  982. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  983. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  984. }
  985. oldStatus := task.Status
  986. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  987. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  988. task.Duration = result.JobInfo.RunSec
  989. if task.Duration < 0 {
  990. task.Duration = 0
  991. }
  992. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  993. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  994. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  995. }
  996. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  997. task.EndTime = task.StartTime.Add(task.Duration)
  998. }
  999. task.CorrectCreateUnix()
  1000. if oldStatus != task.Status {
  1001. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1002. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1003. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1004. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1005. }
  1006. }
  1007. }
  1008. }
  1009. err = models.UpdateJob(task)
  1010. if err != nil {
  1011. log.Error("UpdateJob failed:" + err.Error())
  1012. }
  1013. }
  1014. }
  1015. if len(task.Parameters) > 0 {
  1016. var parameters models.Parameters
  1017. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1018. if err != nil {
  1019. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1020. ctx.ServerError("system error", err)
  1021. return
  1022. }
  1023. if len(parameters.Parameter) > 0 {
  1024. paramTemp := ""
  1025. for _, Parameter := range parameters.Parameter {
  1026. param := Parameter.Label + " = " + Parameter.Value + "; "
  1027. paramTemp = paramTemp + param
  1028. }
  1029. task.Parameters = paramTemp[:len(paramTemp)-2]
  1030. } else {
  1031. task.Parameters = ""
  1032. }
  1033. }
  1034. user, err := models.GetUserByID(task.UserID)
  1035. if err == nil {
  1036. task.User = user
  1037. }
  1038. prepareSpec4Show(ctx, task)
  1039. ctx.Data["task"] = task
  1040. ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task)
  1041. ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task)
  1042. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1043. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1044. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  1045. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  1046. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  1047. ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
  1048. }
  1049. func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload {
  1050. datasetDownload := make([]*models.DatasetDownload, 0)
  1051. if ctx.IsSigned {
  1052. if task.Uuid != "" && task.UserID == ctx.User.ID {
  1053. if task.IsGPUTask() {
  1054. return GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1055. } else {
  1056. datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1057. datasetObsUrlList := make([]NotebookDataset, 0)
  1058. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1059. for _, datasetInfo := range datasetDownload {
  1060. for _, datasetObs := range datasetObsUrlList {
  1061. log.Info("datasetObsUrl:" + datasetObs.DatasetUrl + "datasetName:" + datasetInfo.DatasetName)
  1062. if strings.Contains(datasetObs.DatasetUrl, datasetInfo.DatasetName) {
  1063. datasetInfo.DatasetDownloadLink = datasetObs.DatasetUrl
  1064. break
  1065. }
  1066. }
  1067. }
  1068. }
  1069. }
  1070. }
  1071. return datasetDownload
  1072. }
  1073. func getModelDownloadInfo(ctx *context.Context, task *models.Cloudbrain) *models.ModelDownload {
  1074. var modelDownload models.ModelDownload
  1075. if ctx.IsSigned {
  1076. if task.ModelName != "" && task.UserID == ctx.User.ID {
  1077. if task.IsNPUTask() {
  1078. modelDownload = models.ModelDownload{
  1079. Name: task.CkptName,
  1080. DownloadLink: "",
  1081. IsDelete: false,
  1082. }
  1083. if !HasModelFile(task) {
  1084. modelDownload.IsDelete = true
  1085. }
  1086. datasetObsUrlList := make([]NotebookDataset, 0)
  1087. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1088. for _, datasetObs := range datasetObsUrlList {
  1089. if strings.Contains(datasetObs.DatasetUrl, task.CkptName) {
  1090. modelDownload.DownloadLink = datasetObs.DatasetUrl
  1091. break
  1092. }
  1093. }
  1094. }
  1095. }
  1096. }
  1097. return &modelDownload
  1098. }
  1099. func GrampusTrainJobShow(ctx *context.Context) {
  1100. ctx.Data["PageIsCloudBrain"] = true
  1101. var task *models.Cloudbrain
  1102. task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid"))
  1103. if err != nil {
  1104. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  1105. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1106. return
  1107. }
  1108. task.ContainerIp = ""
  1109. task.User, _ = models.GetUserByID(task.UserID)
  1110. if task.DeletedAt.IsZero() { //normal record
  1111. result, err := grampus.GetJob(task.JobID)
  1112. if err != nil {
  1113. log.Error("GetJob failed:" + err.Error())
  1114. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1115. return
  1116. }
  1117. if result != nil {
  1118. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1119. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1120. }
  1121. oldStatus := task.Status
  1122. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1123. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  1124. task.Duration = result.JobInfo.RunSec
  1125. if task.Duration < 0 {
  1126. task.Duration = 0
  1127. }
  1128. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  1129. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  1130. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  1131. }
  1132. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  1133. task.EndTime = task.StartTime.Add(task.Duration)
  1134. }
  1135. task.CorrectCreateUnix()
  1136. if oldStatus != task.Status {
  1137. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1138. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1139. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1140. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1141. }
  1142. }
  1143. }
  1144. }
  1145. err = models.UpdateJob(task)
  1146. if err != nil {
  1147. log.Error("UpdateJob failed:" + err.Error())
  1148. }
  1149. }
  1150. }
  1151. if len(task.Parameters) > 0 {
  1152. var parameters models.Parameters
  1153. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1154. if err != nil {
  1155. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1156. ctx.ServerError("system error", err)
  1157. return
  1158. }
  1159. if len(parameters.Parameter) > 0 {
  1160. paramTemp := ""
  1161. for _, Parameter := range parameters.Parameter {
  1162. param := Parameter.Label + " = " + Parameter.Value + "; "
  1163. paramTemp = paramTemp + param
  1164. }
  1165. task.Parameters = paramTemp[:len(paramTemp)-2]
  1166. } else {
  1167. task.Parameters = ""
  1168. }
  1169. }
  1170. taskList := make([]*models.Cloudbrain, 0)
  1171. taskList = append(taskList, task)
  1172. prepareSpec4Show(ctx, task)
  1173. ctx.Data["version_list_task"] = taskList
  1174. ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1175. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1176. ctx.Data["displayJobName"] = task.DisplayJobName
  1177. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1178. ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
  1179. }
  1180. func GrampusDownloadLog(ctx *context.Context) {
  1181. jobID := ctx.Params(":jobid")
  1182. job, err := models.GetCloudbrainByJobID(jobID)
  1183. if err != nil {
  1184. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1185. ctx.ServerError(err.Error(), err)
  1186. return
  1187. }
  1188. content, err := grampus.GetTrainJobLog(job.JobID)
  1189. if err != nil {
  1190. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1191. content = ""
  1192. }
  1193. fileName := job.JobName + "-log.txt"
  1194. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName)
  1195. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  1196. var b []byte = []byte(content)
  1197. ctx.Resp.Write(b)
  1198. }
  1199. func GrampusGetLog(ctx *context.Context) {
  1200. jobID := ctx.Params(":jobid")
  1201. job, err := models.GetCloudbrainByJobID(jobID)
  1202. if err != nil {
  1203. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1204. ctx.ServerError(err.Error(), err)
  1205. return
  1206. }
  1207. content, err := grampus.GetTrainJobLog(job.JobID)
  1208. if err != nil {
  1209. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1210. ctx.JSON(http.StatusOK, map[string]interface{}{
  1211. "JobName": job.JobName,
  1212. "Content": "",
  1213. "CanLogDownload": false,
  1214. })
  1215. return
  1216. }
  1217. result, err := grampus.GetJob(jobID)
  1218. if err != nil {
  1219. log.Error("GetJob(%s) failed:%v", job.JobName, err)
  1220. ctx.JSON(http.StatusOK, map[string]interface{}{
  1221. "JobName": job.JobName,
  1222. "Content": content,
  1223. "CanLogDownload": false,
  1224. })
  1225. return
  1226. }
  1227. if result != nil {
  1228. job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1229. if job.Status == models.GrampusStatusFailed {
  1230. content = content + "\n" + result.ExitDiagnostics
  1231. }
  1232. }
  1233. canLogDownload := err == nil && job.IsUserHasRight(ctx.User)
  1234. ctx.JSON(http.StatusOK, map[string]interface{}{
  1235. "JobName": job.JobName,
  1236. "Content": content,
  1237. "CanLogDownload": canLogDownload,
  1238. })
  1239. return
  1240. }
  1241. func GrampusMetrics(ctx *context.Context) {
  1242. jobID := ctx.Params(":jobid")
  1243. job, err := models.GetCloudbrainByJobID(jobID)
  1244. if err != nil {
  1245. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1246. ctx.ServerError(err.Error(), err)
  1247. return
  1248. }
  1249. result, err := grampus.GetGrampusMetrics(job.JobID)
  1250. if err != nil {
  1251. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1252. }
  1253. ctx.JSON(http.StatusOK, map[string]interface{}{
  1254. "JobID": jobID,
  1255. "Interval": result.Interval,
  1256. "MetricsInfo": result.MetricsInfo,
  1257. })
  1258. return
  1259. }
  1260. func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) {
  1261. var command string
  1262. //prepare
  1263. workDir := grampus.NpuWorkDir
  1264. if processorType == grampus.ProcessorTypeNPU {
  1265. command += "pwd;cd " + workDir + grampus.CommandPrepareScriptNpu
  1266. } else if processorType == grampus.ProcessorTypeGPU {
  1267. workDir = grampus.GpuWorkDir
  1268. command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScriptGpu, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
  1269. }
  1270. //download code & dataset
  1271. if processorType == grampus.ProcessorTypeNPU {
  1272. //no need to download code & dataset by internet
  1273. } else if processorType == grampus.ProcessorTypeGPU {
  1274. commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
  1275. commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
  1276. command += commandDownload
  1277. }
  1278. //unzip code & dataset
  1279. if processorType == grampus.ProcessorTypeNPU {
  1280. //no need to process
  1281. } else if processorType == grampus.ProcessorTypeGPU {
  1282. unZipDatasetCommand := cloudbrainTask.GenerateDatasetUnzipCommand(datasetName)
  1283. commandUnzip := "cd " + workDir + "code;unzip -q master.zip;rm -f master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
  1284. command += commandUnzip
  1285. }
  1286. command += "echo \"unzip finished;start to exec code;\";"
  1287. // set export
  1288. var commandExport string
  1289. if processorType == grampus.ProcessorTypeNPU {
  1290. commandExport = "export bucket=" + setting.Bucket + " && export remote_path=" + outputRemotePath + ";"
  1291. } else if processorType == grampus.ProcessorTypeGPU {
  1292. commandExport = "export env=" + setting.Grampus.Env + " && export remote_path=" + outputRemotePath + ";"
  1293. }
  1294. command += commandExport
  1295. //exec code
  1296. var parameters models.Parameters
  1297. var paramCode string
  1298. if len(paramSrc) != 0 {
  1299. err := json.Unmarshal([]byte(paramSrc), &parameters)
  1300. if err != nil {
  1301. log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
  1302. return command, err
  1303. }
  1304. for _, parameter := range parameters.Parameter {
  1305. paramCode += " --" + parameter.Label + "=" + parameter.Value
  1306. }
  1307. }
  1308. var commandCode string
  1309. if processorType == grampus.ProcessorTypeNPU {
  1310. paramCode += " --model_url=" + modelRemoteObsUrl
  1311. commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py " + grampus.NpuLocalLogUrl + paramCode + ";"
  1312. } else if processorType == grampus.ProcessorTypeGPU {
  1313. if pretrainModelFileName != "" {
  1314. paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName
  1315. }
  1316. commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
  1317. }
  1318. command += commandCode
  1319. //get exec result
  1320. commandGetRes := "result=$?;"
  1321. command += commandGetRes
  1322. //upload models
  1323. if processorType == grampus.ProcessorTypeNPU {
  1324. // no need to upload
  1325. } else if processorType == grampus.ProcessorTypeGPU {
  1326. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
  1327. command += commandUpload
  1328. }
  1329. //check exec result
  1330. commandCheckRes := "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\""
  1331. command += commandCheckRes
  1332. return command, nil
  1333. }
  // processPretrainModelParameter finishes a downloader command line.
  //
  // When pretrainModelPath is non-empty, the single-quoted model path and model
  // file name are appended as two extra arguments. In every case the returned
  // command is terminated with ";".
  func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
  	if pretrainModelPath == "" {
  		return commandDownload + ";"
  	}
  	return commandDownload + " '" + pretrainModelPath + "' '" + pretrainModelFileName + "';"
  }
  1342. func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
  1343. archiveType := git.ZIP
  1344. archivePath := codePath
  1345. if !com.IsDir(archivePath) {
  1346. if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
  1347. log.Error("MkdirAll failed:" + err.Error())
  1348. return err
  1349. }
  1350. }
  1351. // Get corresponding commit.
  1352. var (
  1353. commit *git.Commit
  1354. err error
  1355. )
  1356. gitRepo := ctx.Repo.GitRepo
  1357. if err != nil {
  1358. log.Error("OpenRepository failed:" + err.Error())
  1359. return err
  1360. }
  1361. if gitRepo.IsBranchExist(branchName) {
  1362. commit, err = gitRepo.GetBranchCommit(branchName)
  1363. if err != nil {
  1364. log.Error("GetBranchCommit failed:" + err.Error())
  1365. return err
  1366. }
  1367. } else {
  1368. log.Error("the branch is not exist: " + branchName)
  1369. return fmt.Errorf("The branch does not exist.")
  1370. }
  1371. archivePath = path.Join(archivePath, grampus.CodeArchiveName)
  1372. if !com.IsFile(archivePath) {
  1373. if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
  1374. Format: archiveType,
  1375. Prefix: setting.Repository.PrefixArchiveFiles,
  1376. }); err != nil {
  1377. log.Error("CreateArchive failed:" + err.Error())
  1378. return err
  1379. }
  1380. }
  1381. return nil
  1382. }
  1383. func HandleTaskWithAiCenter(ctx *context.Context) {
  1384. log.Info("HandleTaskWithAiCenter start")
  1385. updateCounts := 0
  1386. cloudBrains, err := models.GetC2NetWithAiCenterWrongJob()
  1387. if err != nil {
  1388. log.Error("GetC2NetWithAiCenterWrongJob failed:" + err.Error())
  1389. return
  1390. }
  1391. if len(cloudBrains) == 0 {
  1392. log.Info("HandleC2NetWithAiCenterWrongJob:no task need handle")
  1393. return
  1394. }
  1395. cloudBrainCounts := len(cloudBrains)
  1396. for _, task := range cloudBrains {
  1397. result, err := grampus.GetJob(task.JobID)
  1398. if err != nil {
  1399. log.Error("GetJob failed:" + err.Error())
  1400. continue
  1401. }
  1402. if len(result.JobInfo.Tasks) != 0 {
  1403. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1404. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1405. }
  1406. err = models.UpdateJob(task)
  1407. if err != nil {
  1408. log.Error("UpdateJob failed:" + err.Error())
  1409. }
  1410. updateCounts++
  1411. }
  1412. }
  1413. r := make(map[string]interface{}, 0)
  1414. r["cloudBrainCounts"] = cloudBrainCounts
  1415. r["updateCounts"] = updateCounts
  1416. ctx.JSON(http.StatusOK, response.SuccessWithData(r))
  1417. }
  1418. func GrampusNotebookDebug(ctx *context.Context) {
  1419. result, err := grampus.GetNotebookJob(ctx.Cloudbrain.JobID)
  1420. if err != nil {
  1421. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  1422. return
  1423. }
  1424. if len(result.JobInfo.Tasks) > 0 {
  1425. ctx.Redirect(result.JobInfo.Tasks[0].Url + "?token=" + result.JobInfo.Tasks[0].Token)
  1426. return
  1427. }
  1428. ctx.NotFound("Can not find the job.", nil)
  1429. }
  1430. func GrampusNotebookRestart(ctx *context.Context) {
  1431. var id = ctx.Params(":id")
  1432. var resultCode = "-1"
  1433. var errorMsg = ""
  1434. var status = ""
  1435. var spec *models.Specification
  1436. task := ctx.Cloudbrain
  1437. if ctx.Written() {
  1438. return
  1439. }
  1440. lockOperator, errMsg := cloudbrainService.Lock4CloudbrainRestart(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{JobType: task.JobType}, User: ctx.User})
  1441. defer func() {
  1442. if lockOperator != nil {
  1443. lockOperator.Unlock()
  1444. }
  1445. }()
  1446. if errMsg != "" {
  1447. log.Error("lock processed failed:%s", errMsg, ctx.Data["MsgID"])
  1448. errorMsg = ctx.Tr(errMsg)
  1449. }
  1450. for {
  1451. if errorMsg != "" {
  1452. break
  1453. }
  1454. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  1455. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  1456. errorMsg = "the job is not stopped"
  1457. break
  1458. }
  1459. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), task.ComputeResource)
  1460. if err != nil {
  1461. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1462. errorMsg = "system error"
  1463. break
  1464. } else {
  1465. if count >= 1 {
  1466. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1467. resultCode = "2"
  1468. errorMsg = ctx.Tr("repo.cloudbrain.morethanonejob")
  1469. break
  1470. }
  1471. }
  1472. oldSpec, err := resource.GetCloudbrainSpec(task.ID)
  1473. if err != nil || oldSpec == nil {
  1474. log.Error("NotebookManage GetCloudbrainSpec error.%v", err)
  1475. errorMsg = "Resource specification not available"
  1476. break
  1477. }
  1478. computeSourceSimple := models.GPU
  1479. action := models.ActionCreateGrampusGPUDebugTask
  1480. if task.ComputeResource == models.NPUResource {
  1481. computeSourceSimple = models.NPU
  1482. action = models.ActionCreateGrampusNPUDebugTask
  1483. } else if task.ComputeResource == models.GCUResource {
  1484. computeSourceSimple = models.GCU
  1485. action = models.ActionCreateGrampusGCUDebugTask
  1486. }
  1487. spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{
  1488. JobType: models.JobType(task.JobType),
  1489. ComputeResource: computeSourceSimple,
  1490. Cluster: models.C2NetCluster,
  1491. })
  1492. if err != nil || spec == nil {
  1493. log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID)
  1494. errorMsg = "Resource specification not support any more"
  1495. break
  1496. }
  1497. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1498. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  1499. errorMsg = ctx.Tr("points.insufficient_points_balance")
  1500. break
  1501. }
  1502. if task.IsGPUTask() || task.IsGCUTask() {
  1503. if _, err := os.Stat(getOldJobPath(task)); err != nil {
  1504. log.Error("Can not find job minio path", err)
  1505. resultCode = "-1"
  1506. errorMsg = ctx.Tr("cloudbrain.result_cleared")
  1507. break
  1508. }
  1509. }
  1510. if !HasModelFile(task) { //使用预训练模型训练
  1511. errorMsg = ctx.Tr("repo.debug.manage.model_not_exist")
  1512. break
  1513. }
  1514. if hasDatasetDeleted(task) {
  1515. errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist")
  1516. break
  1517. }
  1518. createTime := timeutil.TimeStampNow()
  1519. res, err := grampus.RestartNotebookJob(task.JobID)
  1520. if err != nil {
  1521. log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
  1522. errorMsg = ctx.Tr("repo.debug_again_fail")
  1523. break
  1524. }
  1525. if res.GrampusResult.ErrorCode != 0 || res.NewId == "" {
  1526. log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg)
  1527. errorMsg = ctx.Tr("repo.debug_again_fail")
  1528. if res.GrampusResult.ErrorCode == 5005 {
  1529. errorMsg = ctx.Tr("repo.debug_again_fail_forever")
  1530. }
  1531. break
  1532. }
  1533. newTask := &models.Cloudbrain{
  1534. Status: res.Status,
  1535. UserID: task.UserID,
  1536. RepoID: task.RepoID,
  1537. JobID: res.NewId,
  1538. JobName: task.JobName,
  1539. DisplayJobName: task.DisplayJobName,
  1540. JobType: task.JobType,
  1541. Type: task.Type,
  1542. Uuid: task.Uuid,
  1543. Image: task.Image,
  1544. ImageID: task.ImageID,
  1545. EngineID: task.EngineID,
  1546. CommitID: task.CommitID,
  1547. EngineName: task.EngineName,
  1548. IsLatestVersion: "1",
  1549. BranchName: task.BranchName,
  1550. DatasetName: task.DatasetName,
  1551. ComputeResource: task.ComputeResource,
  1552. Description: task.Description,
  1553. CreatedUnix: createTime,
  1554. UpdatedUnix: createTime,
  1555. Spec: spec,
  1556. ModelName: task.ModelName,
  1557. ModelVersion: task.ModelVersion,
  1558. LabelName: task.LabelName,
  1559. PreTrainModelUrl: task.PreTrainModelUrl,
  1560. CkptName: task.CkptName,
  1561. WorkServerNumber: 1,
  1562. }
  1563. err = models.RestartCloudbrain(task, newTask)
  1564. if err != nil {
  1565. log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  1566. errorMsg = "system error"
  1567. break
  1568. }
  1569. id = strconv.FormatInt(newTask.ID, 10)
  1570. status = res.Status
  1571. resultCode = "0"
  1572. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, action)
  1573. break
  1574. }
  1575. ctx.JSON(200, map[string]string{
  1576. "result_code": resultCode,
  1577. "error_msg": errorMsg,
  1578. "status": status,
  1579. "id": id,
  1580. })
  1581. }