You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

grampus.go 60 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795
  1. package repo
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "io/ioutil"
  7. "net/http"
  8. "os"
  9. "path"
  10. "strconv"
  11. "strings"
  12. "code.gitea.io/gitea/modules/urfs_client/urchin"
  13. "code.gitea.io/gitea/routers/response"
  14. "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
  15. "code.gitea.io/gitea/modules/dataset"
  16. "code.gitea.io/gitea/services/cloudbrain/resource"
  17. "code.gitea.io/gitea/services/reward/point/account"
  18. "code.gitea.io/gitea/modules/auth"
  19. "code.gitea.io/gitea/modules/git"
  20. "code.gitea.io/gitea/modules/grampus"
  21. "code.gitea.io/gitea/modules/modelarts"
  22. "code.gitea.io/gitea/modules/notification"
  23. "code.gitea.io/gitea/modules/redis/redis_key"
  24. "code.gitea.io/gitea/modules/redis/redis_lock"
  25. "code.gitea.io/gitea/modules/timeutil"
  26. "code.gitea.io/gitea/modules/util"
  27. "github.com/unknwon/com"
  28. "code.gitea.io/gitea/models"
  29. "code.gitea.io/gitea/modules/base"
  30. "code.gitea.io/gitea/modules/cloudbrain"
  31. "code.gitea.io/gitea/modules/context"
  32. "code.gitea.io/gitea/modules/log"
  33. "code.gitea.io/gitea/modules/setting"
  34. cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
  35. )
  36. const (
  37. tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
  38. tplGrampusNotebookShow base.TplName = "repo/grampus/notebook/show"
  39. //GPU
  40. tplGrampusNotebookGPUNew base.TplName = "repo/grampus/notebook/gpu/new"
  41. tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
  42. //NPU
  43. tplGrampusNotebookNPUNew base.TplName = "repo/grampus/notebook/npu/new"
  44. tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
  45. //GCU
  46. tplGrampusNotebookGCUNew base.TplName = "repo/grampus/notebook/gcu/new"
  47. )
  48. func GrampusNotebookNew(ctx *context.Context) {
  49. ctx.Data["IsCreate"] = true
  50. notebookType := ctx.QueryInt("type")
  51. processType := grampus.ProcessorTypeGPU
  52. if notebookType == 1 {
  53. processType = grampus.ProcessorTypeNPU
  54. } else if notebookType == 2 {
  55. processType = grampus.ProcessorTypeGCU
  56. }
  57. err := grampusNotebookNewDataPrepare(ctx, processType)
  58. if err != nil {
  59. ctx.ServerError("get new notebook-job info failed", err)
  60. return
  61. }
  62. if processType == grampus.ProcessorTypeGPU {
  63. ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
  64. } else if processType == grampus.ProcessorTypeNPU {
  65. ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
  66. } else if processType == grampus.ProcessorTypeGCU {
  67. ctx.HTML(http.StatusOK, tplGrampusNotebookGCUNew)
  68. }
  69. }
  70. func GrampusTrainJobGPUNew(ctx *context.Context) {
  71. ctx.Data["IsCreate"] = true
  72. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  73. if err != nil {
  74. ctx.ServerError("get new train-job info failed", err)
  75. return
  76. }
  77. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  78. }
  79. func GrampusTrainJobNPUNew(ctx *context.Context) {
  80. ctx.Data["IsCreate"] = true
  81. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  82. if err != nil {
  83. ctx.ServerError("get new train-job info failed", err)
  84. return
  85. }
  86. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  87. }
  88. func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) {
  89. ctx.Data["IsCreate"] = true
  90. displayJobName := form.DisplayJobName
  91. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  92. uuid := form.Attachment
  93. description := form.Description
  94. repo := ctx.Repo.Repository
  95. branchName := form.BranchName
  96. image := strings.TrimSpace(form.Image)
  97. codeStoragePath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  98. tpl := tplGrampusNotebookGPUNew
  99. processType := grampus.ProcessorTypeGPU
  100. computeSource := models.GPUResource
  101. computeSourceSimple := models.GPU
  102. if form.Type == 1 {
  103. tpl = tplGrampusNotebookNPUNew
  104. processType = grampus.ProcessorTypeNPU
  105. computeSource = models.NPUResource
  106. computeSourceSimple = models.NPU
  107. codeStoragePath = grampus.JobPath + jobName + modelarts.CodePath
  108. } else if form.Type == 2 {
  109. tpl = tplGrampusNotebookGCUNew
  110. processType = grampus.ProcessorTypeGCU
  111. computeSource = models.GCUResource
  112. computeSourceSimple = models.GCU
  113. codeStoragePath = setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  114. }
  115. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeDebug), displayJobName))
  116. defer lock.UnLock()
  117. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  118. if !isOk {
  119. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  120. grampusNotebookNewDataPrepare(ctx, processType)
  121. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tpl, &form)
  122. return
  123. }
  124. if !jobNamePattern.MatchString(displayJobName) {
  125. grampusNotebookNewDataPrepare(ctx, processType)
  126. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  127. return
  128. }
  129. //check count limit
  130. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeSource)
  131. if err != nil {
  132. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  133. grampusNotebookNewDataPrepare(ctx, processType)
  134. ctx.RenderWithErr("system error", tpl, &form)
  135. return
  136. } else {
  137. if count >= 1 {
  138. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  139. grampusNotebookNewDataPrepare(ctx, processType)
  140. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  141. return
  142. }
  143. }
  144. //check whether the task name in the project is duplicated
  145. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
  146. if err == nil {
  147. if len(tasks) != 0 {
  148. log.Error("the job name did already exist", ctx.Data["MsgID"])
  149. grampusNotebookNewDataPrepare(ctx, processType)
  150. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  151. return
  152. }
  153. } else {
  154. if !models.IsErrJobNotExist(err) {
  155. log.Error("system error, %v", err, ctx.Data["MsgID"])
  156. grampusNotebookNewDataPrepare(ctx, processType)
  157. ctx.RenderWithErr("system error", tpl, &form)
  158. return
  159. }
  160. }
  161. //check specification
  162. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  163. JobType: models.JobTypeDebug,
  164. ComputeResource: computeSourceSimple,
  165. Cluster: models.C2NetCluster,
  166. })
  167. if err != nil || spec == nil {
  168. grampusNotebookNewDataPrepare(ctx, processType)
  169. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  170. return
  171. }
  172. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  173. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  174. grampusNotebookNewDataPrepare(ctx, processType)
  175. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
  176. return
  177. }
  178. var datasetInfos map[string]models.DatasetInfo
  179. var datasetNames string
  180. //var
  181. if uuid != "" {
  182. datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, computeSourceSimple)
  183. if err != nil {
  184. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  185. grampusNotebookNewDataPrepare(ctx, processType)
  186. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  187. return
  188. }
  189. uuidArray := strings.Split(uuid, ";")
  190. if datasetInfos == nil || len(datasetInfos) < len(uuidArray) {
  191. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.partial_datasets_not_available"), tpl, &form)
  192. return
  193. }
  194. }
  195. //prepare code and out path
  196. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  197. _, err = ioutil.ReadDir(codeLocalPath)
  198. if err == nil {
  199. os.RemoveAll(codeLocalPath)
  200. }
  201. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  202. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  203. grampusNotebookNewDataPrepare(ctx, processType)
  204. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  205. return
  206. }
  207. if processType == grampus.ProcessorTypeGPU || processType == grampus.ProcessorTypeGCU {
  208. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  209. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  210. grampusNotebookNewDataPrepare(ctx, processType)
  211. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  212. return
  213. }
  214. } else {
  215. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  216. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  217. grampusNotebookNewDataPrepare(ctx, processType)
  218. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  219. return
  220. }
  221. }
  222. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  223. req := &grampus.GenerateNotebookJobReq{
  224. JobName: jobName,
  225. DisplayJobName: displayJobName,
  226. ComputeResource: computeSource,
  227. ProcessType: processType,
  228. ImageUrl: image,
  229. ImageId: form.ImageID,
  230. Description: description,
  231. Uuid: uuid,
  232. CommitID: commitID,
  233. BranchName: branchName,
  234. DatasetNames: datasetNames,
  235. DatasetInfos: datasetInfos,
  236. Spec: spec,
  237. CodeStoragePath: codeStoragePath,
  238. CodeName: strings.ToLower(repo.Name),
  239. }
  240. if form.ModelName != "" { //使用预训练模型训练
  241. m, err := models.QueryModelByPath(form.PreTrainModelUrl)
  242. if err != nil {
  243. log.Error("Can not find model", err)
  244. grampusNotebookNewDataPrepare(ctx, processType)
  245. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tpl, &form)
  246. return
  247. }
  248. if !cloudbrainTask.IsModelFileExists(m, form.CkptName) {
  249. log.Error("model file not exist.name = %s", form.CkptName)
  250. grampusNotebookNewDataPrepare(ctx, processType)
  251. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_file_not_exist"), tpl, &form)
  252. return
  253. }
  254. req.ModelName = form.ModelName
  255. req.LabelName = form.LabelName
  256. req.CkptName = form.CkptName
  257. req.ModelVersion = form.ModelVersion
  258. req.PreTrainModelUrl = form.PreTrainModelUrl
  259. req.PreTrainModelPath = getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  260. req.ModelStorageType = m.Type
  261. }
  262. _, err = grampus.GenerateNotebookJob(ctx, req)
  263. if err != nil {
  264. log.Error("GenerateNotebookJob failed:%v", err.Error(), ctx.Data["MsgID"])
  265. grampusTrainJobNewDataPrepare(ctx, processType)
  266. ctx.RenderWithErr(err.Error(), tpl, &form)
  267. return
  268. }
  269. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  270. }
  271. func grampusNotebookNewDataPrepare(ctx *context.Context, processType string) error {
  272. ctx.Data["PageIsCloudBrain"] = true
  273. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  274. ctx.Data["display_job_name"] = displayJobName
  275. //get valid images
  276. if processType == grampus.ProcessorTypeNPU || processType == grampus.ProcessorTypeGCU {
  277. images, err := grampus.GetImages(processType, string(models.JobTypeDebug))
  278. if err != nil {
  279. log.Error("GetImages failed:", err.Error())
  280. } else {
  281. ctx.Data["images"] = images.Infos
  282. }
  283. }
  284. //prepare available specs
  285. computeResourceSimple := models.GPU
  286. datasetType := models.TypeCloudBrainOne
  287. computeResource := models.GPUResource
  288. if processType == grampus.ProcessorTypeNPU {
  289. computeResourceSimple = models.NPU
  290. datasetType = models.TypeCloudBrainTwo
  291. computeResource = models.NPUResource
  292. } else if processType == grampus.ProcessorTypeGCU {
  293. computeResourceSimple = models.GCU
  294. datasetType = models.TypeCloudBrainAll
  295. computeResource = models.GCUResource
  296. }
  297. prepareGrampusSpecs(ctx, computeResourceSimple, models.JobTypeDebug)
  298. //get branches
  299. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  300. if err != nil {
  301. log.Error("GetBranches error:", err.Error())
  302. } else {
  303. ctx.Data["branches"] = branches
  304. }
  305. ctx.Data["branchName"] = ctx.Repo.BranchName
  306. ctx.Data["datasetType"] = datasetType
  307. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, computeResource, models.JobTypeDebug)
  308. ctx.Data["WaitCount"] = waitCount
  309. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeResource)
  310. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  311. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  312. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  313. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  314. return nil
  315. }
  316. func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error {
  317. ctx.Data["PageIsCloudBrain"] = true
  318. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  319. ctx.Data["display_job_name"] = displayJobName
  320. //get valid images
  321. if processType == grampus.ProcessorTypeNPU {
  322. images, err := grampus.GetImages(processType, string(models.JobTypeTrain))
  323. if err != nil {
  324. log.Error("GetImages failed:", err.Error())
  325. } else {
  326. ctx.Data["images"] = images.Infos
  327. }
  328. }
  329. //prepare available specs
  330. if processType == grampus.ProcessorTypeNPU {
  331. prepareGrampusSpecs(ctx, models.NPU)
  332. } else if processType == grampus.ProcessorTypeGPU {
  333. prepareGrampusSpecs(ctx, models.GPU)
  334. }
  335. //get branches
  336. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  337. if err != nil {
  338. log.Error("GetBranches error:", err.Error())
  339. } else {
  340. ctx.Data["branches"] = branches
  341. }
  342. ctx.Data["branchName"] = ctx.Repo.BranchName
  343. if processType == grampus.ProcessorTypeGPU {
  344. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  345. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GPUResource, models.JobTypeTrain)
  346. ctx.Data["WaitCount"] = waitCount
  347. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  348. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  349. } else if processType == grampus.ProcessorTypeNPU {
  350. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  351. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.NPUResource, models.JobTypeTrain)
  352. ctx.Data["WaitCount"] = waitCount
  353. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  354. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  355. }
  356. if ctx.Cloudbrain != nil {
  357. uuids, datasetNames := dataset.GetFilterDeletedAttachments(ctx.Cloudbrain.Uuid)
  358. ctx.Data["attachment"] = uuids
  359. ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile
  360. ctx.Data["image_id"] = ctx.Cloudbrain.ImageID
  361. ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters
  362. ctx.Data["description"] = ctx.Cloudbrain.Description
  363. ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName
  364. ctx.Data["engine_name"] = ctx.Cloudbrain.EngineName
  365. ctx.Data["work_server_number"] = ctx.Cloudbrain.WorkServerNumber
  366. if ctx.Cloudbrain.Image != "" {
  367. ctx.Data["image"] = ctx.Cloudbrain.Image
  368. } else {
  369. ctx.Data["image"] = ctx.Cloudbrain.EngineName
  370. }
  371. ctx.Data["dataset_name"] = datasetNames
  372. ctx.Data["model_name"] = ctx.Cloudbrain.ModelName
  373. ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion
  374. ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName
  375. ctx.Data["label_names"] = ctx.Cloudbrain.LabelName
  376. ctx.Data["pre_train_model_url"] = ctx.Cloudbrain.PreTrainModelUrl
  377. spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID)
  378. if spec != nil {
  379. ctx.Data["spec_id"] = spec.ID
  380. }
  381. }
  382. return nil
  383. }
  384. func GrampusTrainJobVersionNew(ctx *context.Context) {
  385. task := ctx.Cloudbrain
  386. ctx.Data["IsCreate"] = false
  387. if task.ComputeResource == models.GPUResource {
  388. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  389. if err != nil {
  390. ctx.ServerError("get new train-job version info failed", err)
  391. return
  392. }
  393. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  394. } else if task.ComputeResource == models.NPUResource {
  395. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  396. if err != nil {
  397. ctx.ServerError("get new train-job version info failed", err)
  398. return
  399. }
  400. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  401. }
  402. }
  403. func prepareGrampusSpecs(ctx *context.Context, computeResource string, jobType ...models.JobType) {
  404. tempJobType := models.JobTypeTrain
  405. if len(jobType) > 0 {
  406. tempJobType = jobType[0]
  407. }
  408. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  409. JobType: tempJobType,
  410. ComputeResource: computeResource,
  411. Cluster: models.C2NetCluster,
  412. })
  413. ctx.Data["Specs"] = noteBookSpecs
  414. }
  415. func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
  416. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  417. log.Error("the boot file(%s) must be a python file", form.BootFile)
  418. return errors.New("启动文件必须是python文件")
  419. }
  420. if form.BranchName == "" {
  421. log.Error("the branch must not be null!", form.BranchName)
  422. return errors.New("代码分支不能为空!")
  423. }
  424. return nil
  425. }
  426. func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  427. ctx.Data["IsCreate"] = true
  428. grampusTrainJobGpuCreate(ctx, form)
  429. }
  430. func grampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  431. displayJobName := form.DisplayJobName
  432. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  433. uuid := form.Attachment
  434. description := form.Description
  435. bootFile := strings.TrimSpace(form.BootFile)
  436. params := form.Params
  437. repo := ctx.Repo.Repository
  438. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  439. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  440. branchName := form.BranchName
  441. image := strings.TrimSpace(form.Image)
  442. tpl := tplGrampusTrainJobGPUNew
  443. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  444. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  445. if !isOk {
  446. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  447. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  448. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplGrampusTrainJobGPUNew, &form)
  449. return
  450. }
  451. defer lock.UnLock()
  452. if !jobNamePattern.MatchString(displayJobName) {
  453. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  454. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  455. return
  456. }
  457. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  458. if err != nil || !bootFileExist {
  459. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  460. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  461. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  462. return
  463. }
  464. //check count limit
  465. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  466. if err != nil {
  467. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  468. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  469. ctx.RenderWithErr("system error", tpl, &form)
  470. return
  471. } else {
  472. if count >= 1 {
  473. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  474. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  475. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  476. return
  477. }
  478. }
  479. //check param
  480. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  481. log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
  482. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  483. ctx.RenderWithErr(err.Error(), tpl, &form)
  484. return
  485. }
  486. //check whether the task name in the project is duplicated
  487. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  488. if err == nil {
  489. if len(tasks) != 0 {
  490. log.Error("the job name did already exist", ctx.Data["MsgID"])
  491. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  492. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  493. return
  494. }
  495. } else {
  496. if !models.IsErrJobNotExist(err) {
  497. log.Error("system error, %v", err, ctx.Data["MsgID"])
  498. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  499. ctx.RenderWithErr("system error", tpl, &form)
  500. return
  501. }
  502. }
  503. //check specification
  504. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  505. JobType: models.JobTypeTrain,
  506. ComputeResource: models.GPU,
  507. Cluster: models.C2NetCluster,
  508. })
  509. if err != nil || spec == nil {
  510. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  511. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  512. return
  513. }
  514. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  515. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  516. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  517. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobGPUNew, &form)
  518. return
  519. }
  520. //check dataset
  521. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  522. if err != nil {
  523. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  524. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  525. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  526. return
  527. }
  528. //prepare code and out path
  529. _, err = ioutil.ReadDir(codeLocalPath)
  530. if err == nil {
  531. os.RemoveAll(codeLocalPath)
  532. }
  533. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  534. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  535. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  536. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  537. return
  538. }
  539. //todo: upload code (send to file_server todo this work?)
  540. //upload code
  541. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  542. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  543. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  544. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  545. return
  546. }
  547. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  548. if err := mkModelPath(modelPath); err != nil {
  549. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  550. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  551. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  552. return
  553. }
  554. //init model readme
  555. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  556. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  557. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  558. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  559. return
  560. }
  561. var datasetRemotePath, allFileName string
  562. for _, datasetInfo := range datasetInfos {
  563. if datasetRemotePath == "" {
  564. datasetRemotePath = datasetInfo.DataLocalPath
  565. allFileName = datasetInfo.FullName
  566. } else {
  567. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  568. allFileName = allFileName + ";" + datasetInfo.FullName
  569. }
  570. }
  571. //prepare command
  572. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  573. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName, "")
  574. if err != nil {
  575. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  576. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  577. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  578. return
  579. }
  580. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  581. req := &grampus.GenerateTrainJobReq{
  582. JobName: jobName,
  583. DisplayJobName: displayJobName,
  584. ComputeResource: models.GPUResource,
  585. ProcessType: grampus.ProcessorTypeGPU,
  586. Command: command,
  587. ImageUrl: image,
  588. Description: description,
  589. BootFile: bootFile,
  590. Uuid: uuid,
  591. CommitID: commitID,
  592. BranchName: branchName,
  593. Params: form.Params,
  594. EngineName: image,
  595. DatasetNames: datasetNames,
  596. DatasetInfos: datasetInfos,
  597. IsLatestVersion: modelarts.IsLatestVersion,
  598. VersionCount: modelarts.VersionCountOne,
  599. WorkServerNumber: 1,
  600. Spec: spec,
  601. }
  602. if form.ModelName != "" { //使用预训练模型训练
  603. req.ModelName = form.ModelName
  604. req.LabelName = form.LabelName
  605. req.CkptName = form.CkptName
  606. req.ModelVersion = form.ModelVersion
  607. req.PreTrainModelUrl = form.PreTrainModelUrl
  608. }
  609. _, err = grampus.GenerateTrainJob(ctx, req)
  610. if err != nil {
  611. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  612. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  613. ctx.RenderWithErr(err.Error(), tpl, &form)
  614. return
  615. }
  616. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  617. }
  618. func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
  619. index := strings.Index(pretrainModelDir, "/")
  620. if index > 0 {
  621. filterBucket := pretrainModelDir[index+1:]
  622. return filterBucket + fileName
  623. } else {
  624. return ""
  625. }
  626. }
  627. func GrampusTrainJobVersionCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  628. ctx.Data["IsCreate"] = false
  629. computeResource := ctx.Query("compute_resource")
  630. if computeResource == models.GPUResource {
  631. grampusTrainJobGpuCreate(ctx, form)
  632. } else if computeResource == models.NPUResource {
  633. grampusTrainJobNpuCreate(ctx, form)
  634. } else {
  635. ctx.ServerError("resource error", errors.New("compute resource is not support"))
  636. return
  637. }
  638. }
  639. func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  640. ctx.Data["IsCreate"] = true
  641. grampusTrainJobNpuCreate(ctx, form)
  642. }
  643. func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  644. displayJobName := form.DisplayJobName
  645. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  646. uuid := form.Attachment
  647. description := form.Description
  648. bootFile := strings.TrimSpace(form.BootFile)
  649. params := form.Params
  650. repo := ctx.Repo.Repository
  651. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  652. codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
  653. //dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  654. branchName := form.BranchName
  655. isLatestVersion := modelarts.IsLatestVersion
  656. versionCount := modelarts.VersionCountOne
  657. engineName := form.EngineName
  658. tpl := tplGrampusTrainJobNPUNew
  659. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  660. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  661. if !isOk {
  662. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  663. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  664. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplGrampusTrainJobNPUNew, &form)
  665. return
  666. }
  667. defer lock.UnLock()
  668. if !jobNamePattern.MatchString(displayJobName) {
  669. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  670. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  671. return
  672. }
  673. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  674. if err != nil || !bootFileExist {
  675. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  676. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  677. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  678. return
  679. }
  680. //check count limit
  681. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  682. if err != nil {
  683. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  684. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  685. ctx.RenderWithErr("system error", tpl, &form)
  686. return
  687. } else {
  688. if count >= 1 {
  689. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  690. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  691. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  692. return
  693. }
  694. }
  695. //check param
  696. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  697. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  698. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  699. ctx.RenderWithErr(err.Error(), tpl, &form)
  700. return
  701. }
  702. //check whether the task name in the project is duplicated
  703. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  704. if err == nil {
  705. if len(tasks) != 0 {
  706. log.Error("the job name did already exist", ctx.Data["MsgID"])
  707. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  708. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  709. return
  710. }
  711. } else {
  712. if !models.IsErrJobNotExist(err) {
  713. log.Error("system error, %v", err, ctx.Data["MsgID"])
  714. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  715. ctx.RenderWithErr("system error", tpl, &form)
  716. return
  717. }
  718. }
  719. //check specification
  720. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  721. JobType: models.JobTypeTrain,
  722. ComputeResource: models.NPU,
  723. Cluster: models.C2NetCluster,
  724. })
  725. if err != nil || spec == nil {
  726. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  727. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  728. return
  729. }
  730. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  731. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  732. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  733. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobNPUNew, &form)
  734. return
  735. }
  736. //check dataset
  737. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU)
  738. if err != nil {
  739. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  740. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  741. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  742. return
  743. }
  744. //prepare code and out path
  745. _, err = ioutil.ReadDir(codeLocalPath)
  746. if err == nil {
  747. os.RemoveAll(codeLocalPath)
  748. }
  749. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  750. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  751. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  752. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  753. return
  754. }
  755. //todo: upload code (send to file_server todo this work?)
  756. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
  757. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  758. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  759. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  760. return
  761. }
  762. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  763. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  764. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  765. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  766. return
  767. }
  768. var datasetRemotePath, allFileName string
  769. for _, datasetInfo := range datasetInfos {
  770. if datasetRemotePath == "" {
  771. datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  772. allFileName = datasetInfo.FullName
  773. } else {
  774. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  775. allFileName = allFileName + ";" + datasetInfo.FullName
  776. }
  777. }
  778. //prepare command
  779. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  780. command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName, grampus.GetNpuModelRemoteObsUrl(jobName))
  781. if err != nil {
  782. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  783. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  784. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  785. return
  786. }
  787. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  788. req := &grampus.GenerateTrainJobReq{
  789. JobName: jobName,
  790. DisplayJobName: displayJobName,
  791. ComputeResource: models.NPUResource,
  792. ProcessType: grampus.ProcessorTypeNPU,
  793. Command: command,
  794. ImageId: form.ImageID,
  795. Description: description,
  796. CodeObsPath: codeObsPath,
  797. BootFileUrl: codeObsPath + bootFile,
  798. BootFile: bootFile,
  799. WorkServerNumber: form.WorkServerNumber,
  800. Uuid: uuid,
  801. CommitID: commitID,
  802. IsLatestVersion: isLatestVersion,
  803. BranchName: branchName,
  804. Params: form.Params,
  805. EngineName: engineName,
  806. VersionCount: versionCount,
  807. TotalVersionCount: modelarts.TotalVersionCount,
  808. DatasetNames: datasetNames,
  809. DatasetInfos: datasetInfos,
  810. Spec: spec,
  811. CodeName: strings.ToLower(repo.Name),
  812. }
  813. if form.ModelName != "" { //使用预训练模型训练
  814. req.ModelName = form.ModelName
  815. req.LabelName = form.LabelName
  816. req.CkptName = form.CkptName
  817. req.ModelVersion = form.ModelVersion
  818. req.PreTrainModelUrl = form.PreTrainModelUrl
  819. req.PreTrainModelPath = preTrainModelPath
  820. }
  821. _, err = grampus.GenerateTrainJob(ctx, req)
  822. if err != nil {
  823. log.Error("GenerateTrainJob failed:%v", err.Error())
  824. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  825. ctx.RenderWithErr(err.Error(), tpl, &form)
  826. return
  827. }
  828. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  829. }
  830. func GetGrampusNotebook(ctx *context.APIContext) {
  831. var (
  832. err error
  833. )
  834. ID := ctx.Params(":id")
  835. job, err := models.GetCloudbrainByID(ID)
  836. if err != nil {
  837. ctx.NotFound("", err)
  838. log.Error("GetCloudbrainByID failed:", err)
  839. return
  840. }
  841. jobAfter, err := cloudbrainTask.SyncGrampusNotebookStatus(job)
  842. aiCenterName := cloudbrainService.GetAiCenterShow(jobAfter.AiCenter, ctx.Context)
  843. if err != nil {
  844. ctx.NotFound(err)
  845. log.Error("Sync cloud brain one status failed:", err)
  846. return
  847. }
  848. ctx.JSON(http.StatusOK, map[string]interface{}{
  849. "ID": ID,
  850. "JobName": jobAfter.JobName,
  851. "JobStatus": jobAfter.Status,
  852. "AiCenter": aiCenterName,
  853. "CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"),
  854. "CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"),
  855. "JobDuration": jobAfter.TrainJobDuration,
  856. })
  857. }
  858. func GrampusStopJob(ctx *context.Context) {
  859. var ID = ctx.Params(":id")
  860. var resultCode = "0"
  861. var errorMsg = ""
  862. var status = ""
  863. task := ctx.Cloudbrain
  864. for {
  865. if task.Status == models.GrampusStatusStopped || task.Status == models.GrampusStatusFailed || task.Status == models.GrampusStatusSucceeded {
  866. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  867. resultCode = "-1"
  868. errorMsg = ctx.Tr("cloudbrain.Already_stopped")
  869. break
  870. }
  871. res, err := grampus.StopJob(task.JobID, task.JobType)
  872. if err != nil {
  873. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  874. resultCode = strconv.Itoa(res.ErrorCode)
  875. errorMsg = ctx.Tr("cloudbrain.Stopped_failed")
  876. break
  877. }
  878. oldStatus := task.Status
  879. task.Status = getStopJobResponseStatus(res)
  880. if task.EndTime == 0 {
  881. task.EndTime = timeutil.TimeStampNow()
  882. }
  883. task.ComputeAndSetDuration()
  884. if oldStatus != task.Status {
  885. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  886. }
  887. err = models.UpdateJob(task)
  888. if err != nil {
  889. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  890. resultCode = "-1"
  891. errorMsg = "system error"
  892. break
  893. }
  894. status = task.Status
  895. break
  896. }
  897. ctx.JSON(200, map[string]interface{}{
  898. "result_code": resultCode,
  899. "error_msg": errorMsg,
  900. "status": status,
  901. "id": ID,
  902. "StatusOK": 0,
  903. })
  904. }
  905. func getStopJobResponseStatus(res *models.GrampusStopJobResponse) string {
  906. newStatus := models.GrampusStatusStopping
  907. if res.Status != "" {
  908. newStatus = grampus.TransTrainJobStatus(res.Status)
  909. }
  910. return newStatus
  911. }
  912. func GrampusNotebookDel(ctx *context.Context) {
  913. var listType = ctx.Query("listType")
  914. if err := deleteGrampusJob(ctx); err != nil {
  915. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  916. ctx.ServerError(err.Error(), err)
  917. return
  918. }
  919. var isAdminPage = ctx.Query("isadminpage")
  920. var isHomePage = ctx.Query("ishomepage")
  921. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  922. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  923. } else if isHomePage == "true" {
  924. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  925. } else {
  926. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  927. }
  928. }
  929. func GrampusTrainJobDel(ctx *context.Context) {
  930. var listType = ctx.Query("listType")
  931. if err := deleteGrampusJob(ctx); err != nil {
  932. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  933. ctx.ServerError(err.Error(), err)
  934. return
  935. }
  936. var isAdminPage = ctx.Query("isadminpage")
  937. var isHomePage = ctx.Query("ishomepage")
  938. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  939. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  940. } else if isHomePage == "true" {
  941. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  942. } else {
  943. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  944. }
  945. }
  946. func deleteGrampusJob(ctx *context.Context) error {
  947. task := ctx.Cloudbrain
  948. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  949. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  950. return errors.New(ctx.Tr("cloudbrain.Not_Stopped"))
  951. }
  952. err := models.DeleteJob(task)
  953. if err != nil {
  954. log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"])
  955. return err
  956. }
  957. storageType := models.TypeCloudBrainOne
  958. if task.ComputeResource == models.NPUResource {
  959. storageType = models.TypeCloudBrainTwo
  960. }
  961. DeleteCloudbrainJobStorage(task.JobName, storageType)
  962. return nil
  963. }
  964. type NotebookDataset struct {
  965. DatasetUrl string `json:"dataset_url"`
  966. }
  967. func GrampusNotebookShow(ctx *context.Context) {
  968. ctx.Data["PageIsCloudBrain"] = true
  969. var task *models.Cloudbrain
  970. task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
  971. if err != nil {
  972. log.Error("GetCloudbrainByID failed:" + err.Error())
  973. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  974. return
  975. }
  976. task.ContainerIp = ""
  977. if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
  978. result, err := grampus.GetNotebookJob(task.JobID)
  979. if err != nil {
  980. log.Error("GetJob failed:" + err.Error())
  981. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  982. return
  983. }
  984. if result != nil {
  985. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  986. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  987. }
  988. oldStatus := task.Status
  989. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  990. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  991. task.Duration = result.JobInfo.RunSec
  992. if task.Duration < 0 {
  993. task.Duration = 0
  994. }
  995. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  996. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  997. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  998. }
  999. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  1000. task.EndTime = task.StartTime.Add(task.Duration)
  1001. }
  1002. task.CorrectCreateUnix()
  1003. if oldStatus != task.Status {
  1004. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1005. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1006. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1007. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1008. }
  1009. }
  1010. }
  1011. }
  1012. err = models.UpdateJob(task)
  1013. if err != nil {
  1014. log.Error("UpdateJob failed:" + err.Error())
  1015. }
  1016. }
  1017. }
  1018. if len(task.Parameters) > 0 {
  1019. var parameters models.Parameters
  1020. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1021. if err != nil {
  1022. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1023. ctx.ServerError("system error", err)
  1024. return
  1025. }
  1026. if len(parameters.Parameter) > 0 {
  1027. paramTemp := ""
  1028. for _, Parameter := range parameters.Parameter {
  1029. param := Parameter.Label + " = " + Parameter.Value + "; "
  1030. paramTemp = paramTemp + param
  1031. }
  1032. task.Parameters = paramTemp[:len(paramTemp)-2]
  1033. } else {
  1034. task.Parameters = ""
  1035. }
  1036. }
  1037. user, err := models.GetUserByID(task.UserID)
  1038. if err == nil {
  1039. task.User = user
  1040. }
  1041. prepareSpec4Show(ctx, task)
  1042. ctx.Data["task"] = task
  1043. ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task)
  1044. ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task)
  1045. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1046. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1047. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  1048. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  1049. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  1050. ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
  1051. }
  1052. func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload {
  1053. datasetDownload := make([]*models.DatasetDownload, 0)
  1054. if ctx.IsSigned {
  1055. if task.Uuid != "" && task.UserID == ctx.User.ID {
  1056. if task.IsGPUTask() {
  1057. return GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1058. } else {
  1059. datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1060. datasetObsUrlList := make([]NotebookDataset, 0)
  1061. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1062. for _, datasetInfo := range datasetDownload {
  1063. for _, datasetObs := range datasetObsUrlList {
  1064. log.Info("datasetObsUrl:" + datasetObs.DatasetUrl + "datasetName:" + datasetInfo.DatasetName)
  1065. if strings.Contains(datasetObs.DatasetUrl, datasetInfo.DatasetName) {
  1066. datasetInfo.DatasetDownloadLink = datasetObs.DatasetUrl
  1067. break
  1068. }
  1069. }
  1070. }
  1071. }
  1072. }
  1073. }
  1074. return datasetDownload
  1075. }
  1076. func getModelDownloadInfo(ctx *context.Context, task *models.Cloudbrain) *models.ModelDownload {
  1077. var modelDownload models.ModelDownload
  1078. if ctx.IsSigned {
  1079. if task.ModelName != "" && task.UserID == ctx.User.ID {
  1080. if task.IsNPUTask() {
  1081. modelDownload = models.ModelDownload{
  1082. Name: task.CkptName,
  1083. DownloadLink: "",
  1084. IsDelete: false,
  1085. }
  1086. if !HasModelFile(task) {
  1087. modelDownload.IsDelete = true
  1088. }
  1089. datasetObsUrlList := make([]NotebookDataset, 0)
  1090. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1091. for _, datasetObs := range datasetObsUrlList {
  1092. if strings.Contains(datasetObs.DatasetUrl, task.CkptName) {
  1093. modelDownload.DownloadLink = datasetObs.DatasetUrl
  1094. break
  1095. }
  1096. }
  1097. }
  1098. }
  1099. }
  1100. return &modelDownload
  1101. }
  1102. func GrampusTrainJobShow(ctx *context.Context) {
  1103. ctx.Data["PageIsCloudBrain"] = true
  1104. var task *models.Cloudbrain
  1105. task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid"))
  1106. if err != nil {
  1107. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  1108. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1109. return
  1110. }
  1111. task.ContainerIp = ""
  1112. task.User, _ = models.GetUserByID(task.UserID)
  1113. if task.DeletedAt.IsZero() { //normal record
  1114. result, err := grampus.GetJob(task.JobID)
  1115. if err != nil {
  1116. log.Error("GetJob failed:" + err.Error())
  1117. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1118. return
  1119. }
  1120. if result != nil {
  1121. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1122. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1123. }
  1124. oldStatus := task.Status
  1125. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1126. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  1127. task.Duration = result.JobInfo.RunSec
  1128. if task.Duration < 0 {
  1129. task.Duration = 0
  1130. }
  1131. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  1132. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  1133. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  1134. }
  1135. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  1136. task.EndTime = task.StartTime.Add(task.Duration)
  1137. }
  1138. task.CorrectCreateUnix()
  1139. if oldStatus != task.Status {
  1140. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1141. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1142. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1143. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1144. }
  1145. }
  1146. }
  1147. }
  1148. err = models.UpdateJob(task)
  1149. if err != nil {
  1150. log.Error("UpdateJob failed:" + err.Error())
  1151. }
  1152. }
  1153. }
  1154. if len(task.Parameters) > 0 {
  1155. var parameters models.Parameters
  1156. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1157. if err != nil {
  1158. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1159. ctx.ServerError("system error", err)
  1160. return
  1161. }
  1162. if len(parameters.Parameter) > 0 {
  1163. paramTemp := ""
  1164. for _, Parameter := range parameters.Parameter {
  1165. param := Parameter.Label + " = " + Parameter.Value + "; "
  1166. paramTemp = paramTemp + param
  1167. }
  1168. task.Parameters = paramTemp[:len(paramTemp)-2]
  1169. } else {
  1170. task.Parameters = ""
  1171. }
  1172. }
  1173. taskList := make([]*models.Cloudbrain, 0)
  1174. taskList = append(taskList, task)
  1175. prepareSpec4Show(ctx, task)
  1176. ctx.Data["version_list_task"] = taskList
  1177. ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1178. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1179. ctx.Data["displayJobName"] = task.DisplayJobName
  1180. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1181. ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
  1182. }
  1183. func GrampusDownloadLog(ctx *context.Context) {
  1184. jobID := ctx.Params(":jobid")
  1185. job, err := models.GetCloudbrainByJobID(jobID)
  1186. if err != nil {
  1187. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1188. ctx.ServerError(err.Error(), err)
  1189. return
  1190. }
  1191. content, err := grampus.GetTrainJobLog(job.JobID)
  1192. if err != nil {
  1193. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1194. content = ""
  1195. }
  1196. fileName := job.JobName + "-log.txt"
  1197. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName)
  1198. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  1199. var b []byte = []byte(content)
  1200. ctx.Resp.Write(b)
  1201. }
  1202. func GrampusGetLog(ctx *context.Context) {
  1203. jobID := ctx.Params(":jobid")
  1204. job, err := models.GetCloudbrainByJobID(jobID)
  1205. if err != nil {
  1206. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1207. ctx.ServerError(err.Error(), err)
  1208. return
  1209. }
  1210. content, err := grampus.GetTrainJobLog(job.JobID)
  1211. if err != nil {
  1212. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1213. ctx.JSON(http.StatusOK, map[string]interface{}{
  1214. "JobName": job.JobName,
  1215. "Content": "",
  1216. "CanLogDownload": false,
  1217. })
  1218. return
  1219. }
  1220. result, err := grampus.GetJob(jobID)
  1221. if err != nil {
  1222. log.Error("GetJob(%s) failed:%v", job.JobName, err)
  1223. ctx.JSON(http.StatusOK, map[string]interface{}{
  1224. "JobName": job.JobName,
  1225. "Content": content,
  1226. "CanLogDownload": false,
  1227. })
  1228. return
  1229. }
  1230. if result != nil {
  1231. job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1232. if job.Status == models.GrampusStatusFailed {
  1233. content = content + "\n" + result.ExitDiagnostics
  1234. }
  1235. }
  1236. canLogDownload := err == nil && job.IsUserHasRight(ctx.User)
  1237. ctx.JSON(http.StatusOK, map[string]interface{}{
  1238. "JobName": job.JobName,
  1239. "Content": content,
  1240. "CanLogDownload": canLogDownload,
  1241. })
  1242. return
  1243. }
  1244. func GrampusMetrics(ctx *context.Context) {
  1245. jobID := ctx.Params(":jobid")
  1246. job, err := models.GetCloudbrainByJobID(jobID)
  1247. if err != nil {
  1248. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1249. ctx.ServerError(err.Error(), err)
  1250. return
  1251. }
  1252. result, err := grampus.GetGrampusMetrics(job.JobID)
  1253. if err != nil {
  1254. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1255. }
  1256. ctx.JSON(http.StatusOK, map[string]interface{}{
  1257. "JobID": jobID,
  1258. "Interval": result.Interval,
  1259. "MetricsInfo": result.MetricsInfo,
  1260. })
  1261. return
  1262. }
  1263. func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) {
  1264. var command string
  1265. //prepare
  1266. workDir := grampus.NpuWorkDir
  1267. if processorType == grampus.ProcessorTypeNPU {
  1268. command += "pwd;cd " + workDir + grampus.CommandPrepareScriptNpu
  1269. } else if processorType == grampus.ProcessorTypeGPU {
  1270. workDir = grampus.GpuWorkDir
  1271. command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScriptGpu, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
  1272. }
  1273. //download code & dataset
  1274. if processorType == grampus.ProcessorTypeNPU {
  1275. //no need to download code & dataset by internet
  1276. } else if processorType == grampus.ProcessorTypeGPU {
  1277. commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
  1278. commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
  1279. command += commandDownload
  1280. }
  1281. //unzip code & dataset
  1282. if processorType == grampus.ProcessorTypeNPU {
  1283. //no need to process
  1284. } else if processorType == grampus.ProcessorTypeGPU {
  1285. unZipDatasetCommand := cloudbrainTask.GenerateDatasetUnzipCommand(datasetName)
  1286. commandUnzip := "cd " + workDir + "code;unzip -q master.zip;rm -f master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
  1287. command += commandUnzip
  1288. }
  1289. command += "echo \"unzip finished;start to exec code;\";"
  1290. // set export
  1291. var commandExport string
  1292. if processorType == grampus.ProcessorTypeNPU {
  1293. commandExport = "export bucket=" + setting.Bucket + " && export remote_path=" + outputRemotePath + ";"
  1294. } else if processorType == grampus.ProcessorTypeGPU {
  1295. commandExport = "export env=" + setting.Grampus.Env + " && export remote_path=" + outputRemotePath + ";"
  1296. }
  1297. command += commandExport
  1298. //exec code
  1299. var parameters models.Parameters
  1300. var paramCode string
  1301. if len(paramSrc) != 0 {
  1302. err := json.Unmarshal([]byte(paramSrc), &parameters)
  1303. if err != nil {
  1304. log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
  1305. return command, err
  1306. }
  1307. for _, parameter := range parameters.Parameter {
  1308. paramCode += " --" + parameter.Label + "=" + parameter.Value
  1309. }
  1310. }
  1311. var commandCode string
  1312. if processorType == grampus.ProcessorTypeNPU {
  1313. paramCode += " --model_url=" + modelRemoteObsUrl
  1314. commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py " + grampus.NpuLocalLogUrl + paramCode + ";"
  1315. } else if processorType == grampus.ProcessorTypeGPU {
  1316. if pretrainModelFileName != "" {
  1317. paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName
  1318. }
  1319. commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
  1320. }
  1321. command += commandCode
  1322. //get exec result
  1323. commandGetRes := "result=$?;"
  1324. command += commandGetRes
  1325. //upload models
  1326. if processorType == grampus.ProcessorTypeNPU {
  1327. // no need to upload
  1328. } else if processorType == grampus.ProcessorTypeGPU {
  1329. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
  1330. command += commandUpload
  1331. }
  1332. //check exec result
  1333. commandCheckRes := "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\""
  1334. command += commandCheckRes
  1335. return command, nil
  1336. }
  1337. func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
  1338. commandDownloadTemp := commandDownload
  1339. if pretrainModelPath != "" {
  1340. commandDownloadTemp += " '" + pretrainModelPath + "' '" + pretrainModelFileName + "'"
  1341. }
  1342. commandDownloadTemp += ";"
  1343. return commandDownloadTemp
  1344. }
  1345. func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
  1346. archiveType := git.ZIP
  1347. archivePath := codePath
  1348. if !com.IsDir(archivePath) {
  1349. if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
  1350. log.Error("MkdirAll failed:" + err.Error())
  1351. return err
  1352. }
  1353. }
  1354. // Get corresponding commit.
  1355. var (
  1356. commit *git.Commit
  1357. err error
  1358. )
  1359. gitRepo := ctx.Repo.GitRepo
  1360. if err != nil {
  1361. log.Error("OpenRepository failed:" + err.Error())
  1362. return err
  1363. }
  1364. if gitRepo.IsBranchExist(branchName) {
  1365. commit, err = gitRepo.GetBranchCommit(branchName)
  1366. if err != nil {
  1367. log.Error("GetBranchCommit failed:" + err.Error())
  1368. return err
  1369. }
  1370. } else {
  1371. log.Error("the branch is not exist: " + branchName)
  1372. return fmt.Errorf("The branch does not exist.")
  1373. }
  1374. archivePath = path.Join(archivePath, grampus.CodeArchiveName)
  1375. if !com.IsFile(archivePath) {
  1376. if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
  1377. Format: archiveType,
  1378. Prefix: setting.Repository.PrefixArchiveFiles,
  1379. }); err != nil {
  1380. log.Error("CreateArchive failed:" + err.Error())
  1381. return err
  1382. }
  1383. }
  1384. return nil
  1385. }
  1386. func HandleTaskWithAiCenter(ctx *context.Context) {
  1387. log.Info("HandleTaskWithAiCenter start")
  1388. updateCounts := 0
  1389. cloudBrains, err := models.GetC2NetWithAiCenterWrongJob()
  1390. if err != nil {
  1391. log.Error("GetC2NetWithAiCenterWrongJob failed:" + err.Error())
  1392. return
  1393. }
  1394. if len(cloudBrains) == 0 {
  1395. log.Info("HandleC2NetWithAiCenterWrongJob:no task need handle")
  1396. return
  1397. }
  1398. cloudBrainCounts := len(cloudBrains)
  1399. for _, task := range cloudBrains {
  1400. result, err := grampus.GetJob(task.JobID)
  1401. if err != nil {
  1402. log.Error("GetJob failed:" + err.Error())
  1403. continue
  1404. }
  1405. if len(result.JobInfo.Tasks) != 0 {
  1406. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1407. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1408. }
  1409. err = models.UpdateJob(task)
  1410. if err != nil {
  1411. log.Error("UpdateJob failed:" + err.Error())
  1412. }
  1413. updateCounts++
  1414. }
  1415. }
  1416. r := make(map[string]interface{}, 0)
  1417. r["cloudBrainCounts"] = cloudBrainCounts
  1418. r["updateCounts"] = updateCounts
  1419. ctx.JSON(http.StatusOK, response.SuccessWithData(r))
  1420. }
  1421. func GrampusNotebookDebug(ctx *context.Context) {
  1422. result, err := grampus.GetNotebookJob(ctx.Cloudbrain.JobID)
  1423. if err != nil {
  1424. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  1425. return
  1426. }
  1427. if len(result.JobInfo.Tasks) > 0 {
  1428. ctx.Redirect(result.JobInfo.Tasks[0].Url + "?token=" + result.JobInfo.Tasks[0].Token)
  1429. return
  1430. }
  1431. ctx.NotFound("Can not find the job.", nil)
  1432. }
  1433. func GrampusNotebookRestart(ctx *context.Context) {
  1434. var id = ctx.Params(":id")
  1435. var resultCode = "-1"
  1436. var errorMsg = ""
  1437. var status = ""
  1438. var spec *models.Specification
  1439. task := ctx.Cloudbrain
  1440. if ctx.Written() {
  1441. return
  1442. }
  1443. for {
  1444. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  1445. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  1446. errorMsg = "the job is not stopped"
  1447. break
  1448. }
  1449. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), task.ComputeResource)
  1450. if err != nil {
  1451. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1452. errorMsg = "system error"
  1453. break
  1454. } else {
  1455. if count >= 1 {
  1456. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1457. resultCode = "2"
  1458. errorMsg = ctx.Tr("repo.cloudbrain.morethanonejob")
  1459. break
  1460. }
  1461. }
  1462. oldSpec, err := resource.GetCloudbrainSpec(task.ID)
  1463. if err != nil || oldSpec == nil {
  1464. log.Error("NotebookManage GetCloudbrainSpec error.%v", err)
  1465. errorMsg = "Resource specification not available"
  1466. break
  1467. }
  1468. computeSourceSimple := models.GPU
  1469. action := models.ActionCreateGrampusGPUDebugTask
  1470. if task.ComputeResource == models.NPUResource {
  1471. computeSourceSimple = models.NPU
  1472. action = models.ActionCreateGrampusNPUDebugTask
  1473. } else if task.ComputeResource == models.GCUResource {
  1474. computeSourceSimple = models.GCU
  1475. action = models.ActionCreateGrampusGCUDebugTask
  1476. }
  1477. spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{
  1478. JobType: models.JobType(task.JobType),
  1479. ComputeResource: computeSourceSimple,
  1480. Cluster: models.C2NetCluster,
  1481. })
  1482. if err != nil || spec == nil {
  1483. log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID)
  1484. errorMsg = "Resource specification not support any more"
  1485. break
  1486. }
  1487. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1488. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  1489. errorMsg = ctx.Tr("points.insufficient_points_balance")
  1490. break
  1491. }
  1492. if task.IsGPUTask() || task.IsGCUTask() {
  1493. if _, err := os.Stat(getOldJobPath(task)); err != nil {
  1494. log.Error("Can not find job minio path", err)
  1495. resultCode = "-1"
  1496. errorMsg = ctx.Tr("cloudbrain.result_cleared")
  1497. break
  1498. }
  1499. }
  1500. if !HasModelFile(task) { //使用预训练模型训练
  1501. errorMsg = ctx.Tr("repo.debug.manage.model_not_exist")
  1502. break
  1503. }
  1504. if hasDatasetDeleted(task) {
  1505. errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist")
  1506. break
  1507. }
  1508. createTime := timeutil.TimeStampNow()
  1509. res, err := grampus.RestartNotebookJob(task.JobID)
  1510. if err != nil {
  1511. log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
  1512. errorMsg = ctx.Tr("repo.debug_again_fail")
  1513. break
  1514. }
  1515. if res.GrampusResult.ErrorCode != 0 || res.NewId == "" {
  1516. log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg)
  1517. errorMsg = ctx.Tr("repo.debug_again_fail")
  1518. if res.GrampusResult.ErrorCode == 5005 {
  1519. errorMsg = ctx.Tr("repo.debug_again_fail_forever")
  1520. }
  1521. break
  1522. }
  1523. newTask := &models.Cloudbrain{
  1524. Status: res.Status,
  1525. UserID: task.UserID,
  1526. RepoID: task.RepoID,
  1527. JobID: res.NewId,
  1528. JobName: task.JobName,
  1529. DisplayJobName: task.DisplayJobName,
  1530. JobType: task.JobType,
  1531. Type: task.Type,
  1532. Uuid: task.Uuid,
  1533. Image: task.Image,
  1534. ImageID: task.ImageID,
  1535. EngineID: task.EngineID,
  1536. CommitID: task.CommitID,
  1537. EngineName: task.EngineName,
  1538. IsLatestVersion: "1",
  1539. BranchName: task.BranchName,
  1540. DatasetName: task.DatasetName,
  1541. ComputeResource: task.ComputeResource,
  1542. Description: task.Description,
  1543. CreatedUnix: createTime,
  1544. UpdatedUnix: createTime,
  1545. Spec: spec,
  1546. ModelName: task.ModelName,
  1547. ModelVersion: task.ModelVersion,
  1548. LabelName: task.LabelName,
  1549. PreTrainModelUrl: task.PreTrainModelUrl,
  1550. CkptName: task.CkptName,
  1551. WorkServerNumber: 1,
  1552. }
  1553. err = models.RestartCloudbrain(task, newTask)
  1554. if err != nil {
  1555. log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  1556. errorMsg = "system error"
  1557. break
  1558. }
  1559. id = strconv.FormatInt(newTask.ID, 10)
  1560. status = res.Status
  1561. resultCode = "0"
  1562. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, action)
  1563. break
  1564. }
  1565. ctx.JSON(200, map[string]string{
  1566. "result_code": resultCode,
  1567. "error_msg": errorMsg,
  1568. "status": status,
  1569. "id": id,
  1570. })
  1571. }