You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

grampus.go 14 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. package repo
  2. import (
  3. "code.gitea.io/gitea/modules/auth"
  4. "code.gitea.io/gitea/modules/git"
  5. "code.gitea.io/gitea/modules/grampus"
  6. "code.gitea.io/gitea/modules/modelarts"
  7. "code.gitea.io/gitea/modules/util"
  8. "encoding/json"
  9. "errors"
  10. "io/ioutil"
  11. "net/http"
  12. "os"
  13. "path"
  14. "strconv"
  15. "strings"
  16. "time"
  17. "code.gitea.io/gitea/models"
  18. "code.gitea.io/gitea/modules/base"
  19. "code.gitea.io/gitea/modules/cloudbrain"
  20. "code.gitea.io/gitea/modules/context"
  21. "code.gitea.io/gitea/modules/log"
  22. "code.gitea.io/gitea/modules/setting"
  23. )
  24. const (
  25. //GPU
  26. tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
  27. tplGrampusTrainJobGPUShow base.TplName = "repo/grampus/trainjob/gpu/show"
  28. //NPU
  29. tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
  30. tplGrampusTrainJobNPUShow base.TplName = "repo/grampus/trainjob/npu/show"
  31. )
  32. func GrampusTrainJobGPUNew(ctx *context.Context) {
  33. err := grampusGpuNewDataPrepare(ctx)
  34. if err != nil {
  35. ctx.ServerError("get new train-job info failed", err)
  36. return
  37. }
  38. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  39. }
  40. func grampusGpuNewDataPrepare(ctx *context.Context) error {
  41. ctx.Data["PageIsCloudBrain"] = true
  42. t := time.Now()
  43. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  44. ctx.Data["display_job_name"] = displayJobName
  45. //get valid images
  46. result, err := cloudbrain.GetImages()
  47. if err != nil {
  48. ctx.Data["error"] = err.Error()
  49. log.Error("cloudbrain.GetImages failed:", err.Error(), ctx.Data["MsgID"])
  50. }
  51. for i, payload := range result.Payload.ImageInfo {
  52. if strings.HasPrefix(result.Payload.ImageInfo[i].Place, "192.168") {
  53. result.Payload.ImageInfo[i].PlaceView = payload.Place[strings.Index(payload.Place, "/"):len(payload.Place)]
  54. } else {
  55. result.Payload.ImageInfo[i].PlaceView = payload.Place
  56. }
  57. }
  58. ctx.Data["images"] = result.Payload.ImageInfo
  59. resultPublic, err := cloudbrain.GetPublicImages()
  60. if err != nil {
  61. ctx.Data["error"] = err.Error()
  62. log.Error("cloudbrain.GetPublicImages failed:", err.Error(), ctx.Data["MsgID"])
  63. }
  64. for i, payload := range resultPublic.Payload.ImageInfo {
  65. if strings.HasPrefix(resultPublic.Payload.ImageInfo[i].Place, "192.168") {
  66. resultPublic.Payload.ImageInfo[i].PlaceView = payload.Place[strings.Index(payload.Place, "/"):len(payload.Place)]
  67. } else {
  68. resultPublic.Payload.ImageInfo[i].PlaceView = payload.Place
  69. }
  70. }
  71. ctx.Data["public_images"] = resultPublic.Payload.ImageInfo
  72. //get valid dataset
  73. attachs, err := models.GetAllUserAttachments(ctx.User.ID)
  74. if err != nil {
  75. log.Error("GetAllUserAttachments failed: %v", err, ctx.Data["MsgID"])
  76. return err
  77. }
  78. ctx.Data["attachments"] = attachs
  79. ctx.Data["command"] = cloudbrain.Command
  80. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  81. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  82. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  83. ctx.Data["benchmark_path"] = cloudbrain.BenchMarkMountPath
  84. ctx.Data["is_benchmark_enabled"] = setting.IsBenchmarkEnabled
  85. //get valid resource specs
  86. if categories == nil {
  87. json.Unmarshal([]byte(setting.BenchmarkCategory), &categories)
  88. }
  89. ctx.Data["benchmark_categories"] = categories.Category
  90. ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType
  91. if gpuInfos == nil {
  92. json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos)
  93. }
  94. ctx.Data["gpu_types"] = gpuInfos.GpuInfo
  95. if trainGpuInfos == nil {
  96. json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos)
  97. }
  98. ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo
  99. if benchmarkGpuInfos == nil {
  100. json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos)
  101. }
  102. ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo
  103. if benchmarkResourceSpecs == nil {
  104. json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs)
  105. }
  106. ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec
  107. if cloudbrain.ResourceSpecs == nil {
  108. json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
  109. }
  110. ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec
  111. if cloudbrain.TrainResourceSpecs == nil {
  112. json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
  113. }
  114. ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec
  115. ctx.Data["params"] = ""
  116. ctx.Data["branchName"] = ctx.Repo.BranchName
  117. ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath
  118. ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled
  119. ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath
  120. ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled
  121. ctx.Data["cloudbraintype"] = models.TypeCloudBrainOne
  122. ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode")
  123. return nil
  124. }
  125. func GrampusTrainJobNPUNew(ctx *context.Context) {
  126. err := grampusTrainJobNpuNewDataPrepare(ctx)
  127. if err != nil {
  128. ctx.ServerError("get new train-job info failed", err)
  129. return
  130. }
  131. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  132. }
  133. func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error {
  134. ctx.Data["PageIsCloudBrain"] = true
  135. t := time.Now()
  136. var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  137. ctx.Data["display_job_name"] = displayJobName
  138. //get valid dataset
  139. attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
  140. if err != nil {
  141. ctx.ServerError("GetAllUserAttachments failed:", err)
  142. return err
  143. }
  144. ctx.Data["attachments"] = attachs
  145. //get valid resource specs
  146. var resourcePools modelarts.ResourcePool
  147. if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  148. ctx.ServerError("json.Unmarshal failed:", err)
  149. return err
  150. }
  151. ctx.Data["resource_pools"] = resourcePools.Info
  152. var engines modelarts.Engine
  153. if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  154. ctx.ServerError("json.Unmarshal failed:", err)
  155. return err
  156. }
  157. ctx.Data["engines"] = engines.Info
  158. var versionInfos modelarts.VersionInfo
  159. if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  160. ctx.ServerError("json.Unmarshal failed:", err)
  161. return err
  162. }
  163. ctx.Data["engine_versions"] = versionInfos.Version
  164. var flavorInfos modelarts.Flavor
  165. if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
  166. ctx.ServerError("json.Unmarshal failed:", err)
  167. return err
  168. }
  169. ctx.Data["flavor_infos"] = flavorInfos.Info
  170. ctx.Data["params"] = ""
  171. ctx.Data["branchName"] = ctx.Repo.BranchName
  172. configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
  173. if err != nil {
  174. ctx.ServerError("getConfigList failed:", err)
  175. return err
  176. }
  177. ctx.Data["config_list"] = configList.ParaConfigs
  178. ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo
  179. return nil
  180. }
  181. func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
  182. if !strings.HasSuffix(form.BootFile, ".py") {
  183. log.Error("the boot file(%s) must be a python file", form.BootFile)
  184. return errors.New("启动文件必须是python文件")
  185. }
  186. if form.BranchName == "" {
  187. log.Error("the branch must not be null!", form.BranchName)
  188. return errors.New("代码分支不能为空!")
  189. }
  190. return nil
  191. }
  192. func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  193. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  194. displayJobName := form.DisplayJobName
  195. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  196. //todo:del
  197. jobName = displayJobName
  198. uuid := form.Attachment
  199. description := form.Description
  200. bootFile := form.BootFile
  201. params := form.Params
  202. repo := ctx.Repo.Repository
  203. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  204. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  205. dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
  206. branchName := form.BranchName
  207. isLatestVersion := modelarts.IsLatestVersion
  208. FlavorName := form.FlavorName
  209. VersionCount := modelarts.VersionCount
  210. EngineName := form.EngineName
  211. log.Info(jobName)
  212. count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource)
  213. if err != nil {
  214. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  215. grampusTrainJobNpuNewDataPrepare(ctx)
  216. ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form)
  217. return
  218. } else {
  219. if count >= 1 {
  220. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  221. grampusTrainJobNpuNewDataPrepare(ctx)
  222. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form)
  223. return
  224. }
  225. }
  226. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  227. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  228. grampusTrainJobNpuNewDataPrepare(ctx)
  229. ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
  230. return
  231. }
  232. //check whether the task name in the project is duplicated
  233. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  234. if err == nil {
  235. if len(tasks) != 0 {
  236. log.Error("the job name did already exist", ctx.Data["MsgID"])
  237. grampusTrainJobNpuNewDataPrepare(ctx)
  238. ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form)
  239. return
  240. }
  241. } else {
  242. if !models.IsErrJobNotExist(err) {
  243. log.Error("system error, %v", err, ctx.Data["MsgID"])
  244. grampusTrainJobNpuNewDataPrepare(ctx)
  245. ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form)
  246. return
  247. }
  248. }
  249. //prepare code and out path
  250. _, err = ioutil.ReadDir(codeLocalPath)
  251. if err == nil {
  252. os.RemoveAll(codeLocalPath)
  253. }
  254. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  255. commitID, _ := gitRepo.GetBranchCommitID(branchName)
  256. if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
  257. log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
  258. grampusTrainJobNpuNewDataPrepare(ctx)
  259. ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form)
  260. return
  261. }
  262. //todo: upload code (send to file_server todo this work?)
  263. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  264. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  265. grampusTrainJobNpuNewDataPrepare(ctx)
  266. ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form)
  267. return
  268. }
  269. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  270. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  271. grampusTrainJobNpuNewDataPrepare(ctx)
  272. ctx.RenderWithErr("Failed to obsMkdir_log", tplGrampusTrainJobNPUNew, &form)
  273. return
  274. }
  275. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  276. // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
  277. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  278. grampusTrainJobNpuNewDataPrepare(ctx)
  279. ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form)
  280. return
  281. }
  282. //prepare command
  283. //todo: download code, download dataset, unzip dataset, exec code, upload model
  284. var parameters models.Parameters
  285. param := make([]models.Parameter, 0)
  286. existDeviceTarget := false
  287. if len(params) != 0 {
  288. err := json.Unmarshal([]byte(params), &parameters)
  289. if err != nil {
  290. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  291. grampusTrainJobNpuNewDataPrepare(ctx)
  292. ctx.RenderWithErr("运行参数错误", tplGrampusTrainJobNPUNew, &form)
  293. return
  294. }
  295. for _, parameter := range parameters.Parameter {
  296. if parameter.Label == modelarts.DeviceTarget {
  297. existDeviceTarget = true
  298. }
  299. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  300. param = append(param, models.Parameter{
  301. Label: parameter.Label,
  302. Value: parameter.Value,
  303. })
  304. }
  305. }
  306. }
  307. if !existDeviceTarget {
  308. param = append(param, models.Parameter{
  309. Label: modelarts.DeviceTarget,
  310. Value: modelarts.Ascend,
  311. })
  312. }
  313. req := &grampus.GenerateTrainJobReq{
  314. JobName: jobName,
  315. DisplayJobName: displayJobName,
  316. ComputeResource: models.NPUResource,
  317. Command: "echo \"test\"",
  318. ResourceSpecId: "modelarts.kat1.xlarge",
  319. ImageUrl: "",
  320. ImageId: "tensorflow_1.15-cann_5.0.3-py_3.7-euler_2.8.3-aarch64",
  321. DataUrl: dataPath,
  322. Description: description,
  323. CodeObsPath: codeObsPath,
  324. BootFileUrl: codeObsPath + bootFile,
  325. BootFile: bootFile,
  326. //TrainUrl: outputObsPath,
  327. //FlavorCode: flavorCode,
  328. WorkServerNumber: 1,
  329. //EngineID: int64(engineID),
  330. //LogUrl: logObsPath,
  331. //PoolID: poolID,
  332. Uuid: uuid,
  333. //Parameters: param,
  334. CommitID: commitID,
  335. IsLatestVersion: isLatestVersion,
  336. BranchName: branchName,
  337. //Params: form.Params,
  338. FlavorName: FlavorName,
  339. EngineName: EngineName,
  340. VersionCount: VersionCount,
  341. TotalVersionCount: modelarts.TotalVersionCount,
  342. }
  343. //将params转换Parameters.Parameter,出错时返回给前端
  344. var Parameters modelarts.Parameters
  345. if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
  346. ctx.ServerError("json.Unmarshal failed:", err)
  347. return
  348. }
  349. err = grampus.GenerateTrainJob(ctx, req)
  350. if err != nil {
  351. log.Error("GenerateTrainJob failed:%v", err.Error())
  352. grampusTrainJobNpuNewDataPrepare(ctx)
  353. ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
  354. return
  355. }
  356. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  357. }