You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ai_model_convert.go 23 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "path"
  12. "strings"
  13. "code.gitea.io/gitea/models"
  14. "code.gitea.io/gitea/modules/cloudbrain"
  15. "code.gitea.io/gitea/modules/context"
  16. "code.gitea.io/gitea/modules/git"
  17. "code.gitea.io/gitea/modules/log"
  18. "code.gitea.io/gitea/modules/modelarts"
  19. "code.gitea.io/gitea/modules/setting"
  20. "code.gitea.io/gitea/modules/storage"
  21. "code.gitea.io/gitea/modules/timeutil"
  22. uuid "github.com/satori/go.uuid"
  23. )
  // Templates rendered by the model-conversion handlers in this file.
  const (
  	tplModelManageConvertIndex = "repo/modelmanage/convertIndex"
  	tplModelConvertInfo        = "repo/modelmanage/convertshowinfo"
  )

  // Source framework identifiers; compared against AiModelConvert.SrcEngine.
  const (
  	PYTORCH_ENGINE    = 0
  	TENSORFLOW_ENGINE = 1
  	MINDSPORE_ENGIN   = 2
  )

  // Target format identifiers; compared against AiModelConvert.DestFormat.
  const (
  	CONVERT_FORMAT_ONNX = 0
  	CONVERT_FORMAT_TRT  = 1
  )

  // Output precision identifiers; compared against
  // AiModelConvert.NetOutputFormat when building the TensorRT command.
  const (
  	NetOutputFormat_FP32 = 0
  	NetOutputFormat_FP16 = 1
  )

  // Engine IDs passed to ModelArts for NPU conversion jobs.
  const (
  	NPU_MINDSPORE_IMAGE_ID  = 35
  	NPU_TENSORFLOW_IMAGE_ID = 121
  )

  // Mount points and naming used when assembling GPU conversion jobs.
  const (
  	ModelMountPath   = "/model"
  	CodeMountPath    = "/code"
  	DataSetMountPath = "/dataset"

  	// LogFile is the suffix of the file the GPU command redirects stdout to.
  	LogFile = "log.txt"
  	// DefaultBranchName is the branch cloned from the conversion script repo.
  	DefaultBranchName = "master"
  	// SubTaskName is the name given to the single task role of a GPU job.
  	SubTaskName = "task1"
  	// Success is the result code that createGpuTrainJob expects from CreateJob.
  	Success = "S000"
  )

  // The values below were once hard-coded here and are kept for reference;
  // the code now reads their counterparts from setting.ModelConvert.
  //
  //GpuQueue = "openidgx"
  //GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap"
  //GPU_TENSORFLOW_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx"
  //NPU_MINDSPORE_16_IMAGE = "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend"
  //PytorchOnnxBootFile = "convert_pytorch.py"
  //PytorchTrTBootFile = "convert_pytorch_tensorrt.py"
  //MindsporeBootFile = "convert_mindspore.py"
  //TensorFlowNpuBootFile = "convert_tensorflow.py"
  //TensorFlowGpuBootFile = "convert_tensorflow_gpu.py"
  //ConvertRepoPath = "https://git.openi.org.cn/zouap/npu_test"
  //GPU_Resource_Specs_ID = 1 //cpu 1, gpu 1
  //NPU_FlavorCode = "modelarts.bm.910.arm.public.1"
  //NPU_PoolID = "pool7908321a"
  57. var (
  58. TrainResourceSpecs *models.ResourceSpecs
  59. )
  60. func SaveModelConvert(ctx *context.Context) {
  61. log.Info("save model convert start.")
  62. if !ctx.Repo.CanWrite(models.UnitTypeModelManage) {
  63. ctx.JSON(403, ctx.Tr("repo.model_noright"))
  64. return
  65. }
  66. name := ctx.Query("name")
  67. desc := ctx.Query("desc")
  68. modelId := ctx.Query("modelId")
  69. modelPath := ctx.Query("ModelFile")
  70. SrcEngine := ctx.QueryInt("SrcEngine")
  71. InputShape := ctx.Query("inputshape")
  72. InputDataFormat := ctx.Query("inputdataformat")
  73. DestFormat := ctx.QueryInt("DestFormat")
  74. NetOutputFormat := ctx.QueryInt("NetOutputFormat")
  75. task, err := models.QueryModelById(modelId)
  76. if err != nil {
  77. log.Error("no such model!", err.Error())
  78. ctx.ServerError("no such model:", err)
  79. return
  80. }
  81. uuid := uuid.NewV4()
  82. id := uuid.String()
  83. modelConvert := &models.AiModelConvert{
  84. ID: id,
  85. Name: name,
  86. Description: desc,
  87. Status: string(models.JobWaiting),
  88. SrcEngine: SrcEngine,
  89. RepoId: ctx.Repo.Repository.ID,
  90. ModelName: task.Name,
  91. ModelVersion: task.Version,
  92. ModelId: modelId,
  93. ModelPath: modelPath,
  94. DestFormat: DestFormat,
  95. NetOutputFormat: NetOutputFormat,
  96. InputShape: InputShape,
  97. InputDataFormat: InputDataFormat,
  98. UserId: ctx.User.ID,
  99. }
  100. models.SaveModelConvert(modelConvert)
  101. go goCreateTask(modelConvert, ctx, task)
  102. ctx.JSON(200, map[string]string{
  103. "result_code": "0",
  104. })
  105. }
  106. func goCreateTask(modelConvert *models.AiModelConvert, ctx *context.Context, task *models.AiModelManage) error {
  107. if modelConvert.IsGpuTrainTask() {
  108. log.Info("create gpu train job.")
  109. return createGpuTrainJob(modelConvert, ctx, task)
  110. } else {
  111. //create npu job
  112. log.Info("create npu train job.")
  113. return createNpuTrainJob(modelConvert, ctx, task.Path)
  114. }
  115. }
  116. func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error {
  117. VersionOutputPath := "V0001"
  118. codeLocalPath := setting.JobPath + modelConvert.ID + modelarts.CodePath
  119. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.CodePath
  120. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.OutputPath + VersionOutputPath + "/"
  121. logObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.LogPath + VersionOutputPath + "/"
  122. dataPath := "/" + modelRelativePath
  123. _, err := ioutil.ReadDir(codeLocalPath)
  124. if err == nil {
  125. os.RemoveAll(codeLocalPath)
  126. }
  127. if err := downloadConvertCode(setting.ModelConvert.ConvertRepoPath, codeLocalPath, DefaultBranchName); err != nil {
  128. log.Error("downloadCode failed, server timed out: %s (%v)", setting.ModelConvert.ConvertRepoPath, err)
  129. return err
  130. }
  131. if err := obsMkdir(setting.CodePathPrefix + modelConvert.ID + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  132. log.Error("Failed to obsMkdir_output: %s (%v)", modelConvert.ID+modelarts.OutputPath, err)
  133. return err
  134. }
  135. if err := obsMkdir(setting.CodePathPrefix + modelConvert.ID + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  136. log.Error("Failed to obsMkdir_log: %s (%v)", modelConvert.ID+modelarts.LogPath, err)
  137. return err
  138. }
  139. if err := uploadCodeToObs(codeLocalPath, modelConvert.ID, ""); err != nil {
  140. log.Error("Failed to uploadCodeToObs: %s (%v)", modelConvert.ID, err)
  141. return err
  142. }
  143. intputshape := strings.Split(modelConvert.InputShape, ",")
  144. n := "256"
  145. c := "1"
  146. h := "28"
  147. w := "28"
  148. if len(intputshape) == 4 {
  149. n = intputshape[0]
  150. c = intputshape[1]
  151. h = intputshape[2]
  152. w = intputshape[3]
  153. }
  154. var engineId int64
  155. engineId = int64(NPU_MINDSPORE_IMAGE_ID)
  156. bootfile := setting.ModelConvert.MindsporeBootFile
  157. if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
  158. engineId = int64(NPU_TENSORFLOW_IMAGE_ID)
  159. bootfile = setting.ModelConvert.TensorFlowNpuBootFile
  160. }
  161. userCommand := "/bin/bash /home/work/run_train.sh 's3://" + codeObsPath + "' 'code/" + bootfile + "' '/tmp/log/train.log' --'data_url'='s3://" + dataPath + "' --'train_url'='s3://" + outputObsPath + "'"
  162. userCommand += " --'model'='" + modelConvert.ModelPath + "'"
  163. userCommand += " --'n'='" + fmt.Sprint(n) + "'"
  164. userCommand += " --'c'='" + fmt.Sprint(c) + "'"
  165. userCommand += " --'h'='" + fmt.Sprint(h) + "'"
  166. userCommand += " --'w'='" + fmt.Sprint(w) + "'"
  167. req := &modelarts.GenerateTrainJobReq{
  168. JobName: modelConvert.ID,
  169. DisplayJobName: modelConvert.Name,
  170. DataUrl: dataPath,
  171. Description: modelConvert.Description,
  172. CodeObsPath: codeObsPath,
  173. BootFileUrl: codeObsPath + bootfile,
  174. BootFile: bootfile,
  175. TrainUrl: outputObsPath,
  176. FlavorCode: setting.ModelConvert.NPU_FlavorCode,
  177. WorkServerNumber: 1,
  178. IsLatestVersion: modelarts.IsLatestVersion,
  179. EngineID: engineId,
  180. LogUrl: logObsPath,
  181. PoolID: setting.ModelConvert.NPU_PoolID,
  182. //Parameters: param,
  183. BranchName: DefaultBranchName,
  184. UserImageUrl: setting.ModelConvert.NPU_MINDSPORE_16_IMAGE,
  185. UserCommand: userCommand,
  186. }
  187. result, err := modelarts.GenerateModelConvertTrainJob(req)
  188. if err == nil {
  189. log.Info("jobId=" + fmt.Sprint(result.JobID) + " versionid=" + fmt.Sprint(result.VersionID))
  190. models.UpdateModelConvertModelArts(modelConvert.ID, fmt.Sprint(result.JobID), fmt.Sprint(result.VersionID))
  191. } else {
  192. log.Info("create modelarts taks failed.error=" + err.Error())
  193. models.UpdateModelConvertFailed(modelConvert.ID, "FAILED", err.Error())
  194. }
  195. return err
  196. }
  197. func downloadConvertCode(repopath string, codePath, branchName string) error {
  198. //add "file:///" prefix to make the depth valid
  199. if err := git.Clone(repopath, codePath, git.CloneRepoOptions{Branch: branchName, Depth: 1}); err != nil {
  200. log.Error("Failed to clone repository: %s (%v)", repopath, err)
  201. return err
  202. }
  203. log.Info("srcPath=" + repopath + " codePath=" + codePath)
  204. configFile, err := os.OpenFile(codePath+"/.git/config", os.O_RDWR, 0666)
  205. if err != nil {
  206. log.Error("open file(%s) failed:%v", codePath+"/,git/config", err)
  207. return err
  208. }
  209. defer configFile.Close()
  210. pos := int64(0)
  211. reader := bufio.NewReader(configFile)
  212. for {
  213. line, err := reader.ReadString('\n')
  214. if err != nil {
  215. if err == io.EOF {
  216. log.Error("not find the remote-url")
  217. return nil
  218. } else {
  219. log.Error("read error: %v", err)
  220. return err
  221. }
  222. }
  223. if strings.Contains(line, "url") && strings.Contains(line, ".git") {
  224. originUrl := "\turl = " + repopath + "\n"
  225. if len(line) > len(originUrl) {
  226. originUrl += strings.Repeat(" ", len(line)-len(originUrl))
  227. }
  228. bytes := []byte(originUrl)
  229. _, err := configFile.WriteAt(bytes, pos)
  230. if err != nil {
  231. log.Error("WriteAt failed:%v", err)
  232. return err
  233. }
  234. break
  235. }
  236. pos += int64(len(line))
  237. }
  238. return nil
  239. }
  240. func downloadFromObsToLocal(task *models.AiModelManage, localPath string) error {
  241. path := Model_prefix + models.AttachmentRelativePath(task.ID) + "/"
  242. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
  243. if err == nil {
  244. _, errState := os.Stat(localPath)
  245. if errState != nil {
  246. if err = os.MkdirAll(localPath, os.ModePerm); err != nil {
  247. return err
  248. }
  249. }
  250. for _, oneFile := range allFile {
  251. if oneFile.IsDir {
  252. log.Info(" dir name:" + oneFile.FileName)
  253. } else {
  254. allFileName := localPath + "/" + oneFile.FileName
  255. index := strings.LastIndex(allFileName, "/")
  256. if index != -1 {
  257. parentDir := allFileName[0:index]
  258. if err = os.MkdirAll(parentDir, os.ModePerm); err != nil {
  259. log.Info("make dir may be error," + err.Error())
  260. }
  261. }
  262. fDest, err := os.Create(allFileName)
  263. if err != nil {
  264. log.Info("create file error, download file failed: %s\n", err.Error())
  265. return err
  266. }
  267. body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName)
  268. if err != nil {
  269. log.Info("download file failed: %s\n", err.Error())
  270. return err
  271. } else {
  272. defer body.Close()
  273. p := make([]byte, 1024)
  274. var readErr error
  275. var readCount int
  276. // 读取对象内容
  277. for {
  278. readCount, readErr = body.Read(p)
  279. if readCount > 0 {
  280. fDest.Write(p[:readCount])
  281. }
  282. if readErr != nil {
  283. break
  284. }
  285. }
  286. }
  287. }
  288. }
  289. } else {
  290. log.Info("error,msg=" + err.Error())
  291. return err
  292. }
  293. return nil
  294. }
  295. func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, model *models.AiModelManage) error {
  296. modelRelativePath := model.Path
  297. command := ""
  298. IMAGE_URL := setting.ModelConvert.GPU_PYTORCH_IMAGE
  299. dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath
  300. if modelConvert.SrcEngine == PYTORCH_ENGINE {
  301. if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
  302. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchOnnxBootFile)
  303. } else if modelConvert.DestFormat == CONVERT_FORMAT_TRT {
  304. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchTrTBootFile)
  305. } else {
  306. return errors.New("Not support the format.")
  307. }
  308. } else if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
  309. IMAGE_URL = setting.ModelConvert.GPU_TENSORFLOW_IMAGE
  310. if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
  311. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.TensorFlowGpuBootFile)
  312. } else {
  313. return errors.New("Not support the format.")
  314. }
  315. //如果模型在OBS上,需要下载到本地,并上传到minio中
  316. if model.Type == models.TypeCloudBrainTwo {
  317. relatetiveModelPath := setting.JobPath + modelConvert.ID + "/dataset"
  318. log.Info("local dataset path:" + relatetiveModelPath)
  319. downloadFromObsToLocal(model, relatetiveModelPath)
  320. uploadCodeToMinio(relatetiveModelPath+"/", modelConvert.ID, "/dataset/")
  321. dataActualPath = setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/dataset"
  322. }
  323. }
  324. log.Info("dataActualPath=" + dataActualPath)
  325. log.Info("command=" + command)
  326. codePath := setting.JobPath + modelConvert.ID + CodeMountPath
  327. downloadConvertCode(setting.ModelConvert.ConvertRepoPath, codePath, DefaultBranchName)
  328. uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/")
  329. minioCodePath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/code"
  330. log.Info("minio codePath=" + minioCodePath)
  331. modelPath := setting.JobPath + modelConvert.ID + ModelMountPath + "/"
  332. log.Info("local modelPath=" + modelPath)
  333. mkModelPath(modelPath)
  334. uploadCodeToMinio(modelPath, modelConvert.ID, ModelMountPath+"/")
  335. minioModelPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/model"
  336. log.Info("minio model path=" + minioModelPath)
  337. if TrainResourceSpecs == nil {
  338. json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
  339. }
  340. resourceSpec := TrainResourceSpecs.ResourceSpec[setting.ModelConvert.GPU_Resource_Specs_ID]
  341. jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{
  342. JobName: modelConvert.ID,
  343. RetryCount: 1,
  344. GpuType: setting.ModelConvert.GpuQueue,
  345. Image: IMAGE_URL,
  346. TaskRoles: []models.TaskRole{
  347. {
  348. Name: SubTaskName,
  349. TaskNumber: 1,
  350. MinSucceededTaskCount: 1,
  351. MinFailedTaskCount: 1,
  352. CPUNumber: resourceSpec.CpuNum,
  353. GPUNumber: resourceSpec.GpuNum,
  354. MemoryMB: resourceSpec.MemMiB,
  355. ShmMB: resourceSpec.ShareMemMiB,
  356. Command: command,
  357. NeedIBDevice: false,
  358. IsMainRole: false,
  359. UseNNI: false,
  360. },
  361. },
  362. Volumes: []models.Volume{
  363. {
  364. HostPath: models.StHostPath{
  365. Path: minioCodePath,
  366. MountPath: CodeMountPath,
  367. ReadOnly: false,
  368. },
  369. },
  370. {
  371. HostPath: models.StHostPath{
  372. Path: dataActualPath,
  373. MountPath: DataSetMountPath,
  374. ReadOnly: true,
  375. },
  376. },
  377. {
  378. HostPath: models.StHostPath{
  379. Path: minioModelPath,
  380. MountPath: ModelMountPath,
  381. ReadOnly: false,
  382. },
  383. },
  384. },
  385. })
  386. if err != nil {
  387. log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
  388. models.UpdateModelConvertFailed(modelConvert.ID, "FAILED", err.Error())
  389. return err
  390. }
  391. if jobResult.Code != Success {
  392. log.Error("CreateJob(%s) failed:%s", modelConvert.ID, jobResult.Msg, ctx.Data["MsgID"])
  393. models.UpdateModelConvertFailed(modelConvert.ID, "FAILED", err.Error())
  394. return errors.New(jobResult.Msg)
  395. }
  396. var jobID = jobResult.Payload["jobId"].(string)
  397. log.Info("jobId=" + jobID)
  398. models.UpdateModelConvertCBTI(modelConvert.ID, jobID)
  399. return nil
  400. }
  401. func getGpuModelConvertCommand(name string, modelFile string, modelConvert *models.AiModelConvert, bootfile string) string {
  402. var command string
  403. inputshape := strings.Split(modelConvert.InputShape, ",")
  404. n := "256"
  405. c := "1"
  406. h := "28"
  407. w := "28"
  408. if len(inputshape) == 4 {
  409. n = inputshape[0]
  410. c = inputshape[1]
  411. h = inputshape[2]
  412. w = inputshape[3]
  413. }
  414. command += "python3 /code/" + bootfile + " --model " + modelFile + " --n " + n + " --c " + c + " --h " + h + " --w " + w
  415. if modelConvert.DestFormat == CONVERT_FORMAT_TRT {
  416. if modelConvert.NetOutputFormat == NetOutputFormat_FP16 {
  417. command += " --fp16 True"
  418. } else {
  419. command += " --fp16 False"
  420. }
  421. }
  422. command += " > " + ModelMountPath + "/" + name + "-" + LogFile
  423. return command
  424. }
  425. func DeleteModelConvert(ctx *context.Context) {
  426. log.Info("delete model convert start.")
  427. id := ctx.Params(":id")
  428. err := models.DeleteModelConvertById(id)
  429. if err != nil {
  430. ctx.JSON(500, err.Error())
  431. } else {
  432. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelmanage/convert_model")
  433. }
  434. }
  435. func StopModelConvert(ctx *context.Context) {
  436. id := ctx.Params(":id")
  437. log.Info("stop model convert start.id=" + id)
  438. job, err := models.QueryModelConvertById(id)
  439. if err != nil {
  440. ctx.ServerError("Not found task.", err)
  441. return
  442. }
  443. if job.IsGpuTrainTask() {
  444. err = cloudbrain.StopJob(job.CloudBrainTaskId)
  445. if err != nil {
  446. log.Error("Stop cloudbrain Job(%s) failed:%v", job.CloudBrainTaskId, err)
  447. }
  448. } else {
  449. _, err = modelarts.StopTrainJob(job.CloudBrainTaskId, job.ModelArtsVersionId)
  450. if err != nil {
  451. log.Error("Stop modelarts Job(%s) failed:%v", job.CloudBrainTaskId, err)
  452. }
  453. }
  454. job.Status = string(models.JobStopped)
  455. if job.EndTime == 0 {
  456. job.EndTime = timeutil.TimeStampNow()
  457. }
  458. models.ModelConvertSetDuration(job)
  459. err = models.UpdateModelConvert(job)
  460. if err != nil {
  461. log.Error("UpdateModelConvert failed:", err)
  462. }
  463. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelmanage/convert_model")
  464. }
  465. func ShowModelConvertInfo(ctx *context.Context) {
  466. ctx.Data["ID"] = ctx.Query("ID")
  467. ctx.Data["isModelManage"] = true
  468. ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
  469. job, err := models.QueryModelConvertById(ctx.Query("ID"))
  470. if err == nil {
  471. ctx.Data["task"] = job
  472. } else {
  473. ctx.ServerError("Not found task.", err)
  474. return
  475. }
  476. ctx.Data["Name"] = job.Name
  477. ctx.Data["canDownload"] = isOper(ctx, job.UserId)
  478. user, err := models.GetUserByID(job.UserId)
  479. if err == nil {
  480. job.UserName = user.Name
  481. job.UserRelAvatarLink = user.RelAvatarLink()
  482. }
  483. if job.IsGpuTrainTask() {
  484. ctx.Data["npu_display"] = "none"
  485. ctx.Data["gpu_display"] = "block"
  486. result, err := cloudbrain.GetJob(job.CloudBrainTaskId)
  487. if err != nil {
  488. log.Info("error:" + err.Error())
  489. ctx.Data["error"] = err.Error()
  490. return
  491. }
  492. if result != nil {
  493. jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
  494. ctx.Data["result"] = jobRes
  495. taskRoles := jobRes.TaskRoles
  496. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  497. ctx.Data["taskRes"] = taskRes
  498. ctx.Data["ExitDiagnostics"] = taskRes.TaskStatuses[0].ExitDiagnostics
  499. ctx.Data["AppExitDiagnostics"] = jobRes.JobStatus.AppExitDiagnostics
  500. job.Status = jobRes.JobStatus.State
  501. if jobRes.JobStatus.State != string(models.JobWaiting) && jobRes.JobStatus.State != string(models.JobFailed) {
  502. job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
  503. job.ContainerID = taskRes.TaskStatuses[0].ContainerID
  504. job.Status = taskRes.TaskStatuses[0].State
  505. }
  506. if jobRes.JobStatus.State != string(models.JobWaiting) {
  507. models.ModelComputeAndSetDuration(job, jobRes)
  508. err = models.UpdateModelConvert(job)
  509. if err != nil {
  510. log.Error("UpdateModelConvert failed:", err)
  511. }
  512. }
  513. }
  514. } else {
  515. result, err := modelarts.GetTrainJob(job.CloudBrainTaskId, job.ModelArtsVersionId)
  516. if err != nil {
  517. log.Info("error:" + err.Error())
  518. ctx.Data["error"] = err.Error()
  519. return
  520. }
  521. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  522. job.RunTime = result.Duration / 1000
  523. job.TrainJobDuration = models.ConvertDurationToStr(job.RunTime)
  524. err = models.UpdateModelConvert(job)
  525. if err != nil {
  526. log.Error("UpdateJob failed:", err)
  527. }
  528. ctx.Data["npu_display"] = "block"
  529. ctx.Data["gpu_display"] = "none"
  530. ctx.Data["ExitDiagnostics"] = ""
  531. ctx.Data["AppExitDiagnostics"] = ""
  532. }
  533. ctx.HTML(200, tplModelConvertInfo)
  534. }
  535. func ConvertModelTemplate(ctx *context.Context) {
  536. ctx.Data["isModelManage"] = true
  537. ctx.Data["TRAIN_COUNT"] = 0
  538. SetModelCount(ctx)
  539. ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
  540. ShowModelConvertPageInfo(ctx)
  541. ctx.HTML(200, tplModelManageConvertIndex)
  542. }
  543. func ShowModelConvertPageInfo(ctx *context.Context) {
  544. log.Info("ShowModelConvertInfo start.")
  545. if !isQueryRight(ctx) {
  546. log.Info("no right.")
  547. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  548. return
  549. }
  550. page := ctx.QueryInt("page")
  551. if page <= 0 {
  552. page = 1
  553. }
  554. pageSize := ctx.QueryInt("pageSize")
  555. if pageSize <= 0 {
  556. pageSize = setting.UI.IssuePagingNum
  557. }
  558. repoId := ctx.Repo.Repository.ID
  559. modelResult, count, err := models.QueryModelConvert(&models.AiModelQueryOptions{
  560. ListOptions: models.ListOptions{
  561. Page: page,
  562. PageSize: pageSize,
  563. },
  564. RepoID: repoId,
  565. })
  566. if err != nil {
  567. log.Info("query db error." + err.Error())
  568. ctx.ServerError("Cloudbrain", err)
  569. return
  570. }
  571. ctx.Data["MODEL_CONVERT_COUNT"] = count
  572. userIds := make([]int64, len(modelResult))
  573. for i, model := range modelResult {
  574. model.IsCanOper = isOper(ctx, model.UserId)
  575. model.IsCanDelete = isCanDelete(ctx, model.UserId)
  576. userIds[i] = model.UserId
  577. }
  578. userNameMap := queryUserName(userIds)
  579. for _, model := range modelResult {
  580. value := userNameMap[model.UserId]
  581. if value != nil {
  582. model.UserName = value.Name
  583. model.UserRelAvatarLink = value.RelAvatarLink()
  584. }
  585. }
  586. pager := context.NewPagination(int(count), page, pageSize, 5)
  587. ctx.Data["Page"] = pager
  588. ctx.Data["Tasks"] = modelResult
  589. }
  590. func ModelConvertDownloadModel(ctx *context.Context) {
  591. log.Info("enter here......")
  592. id := ctx.Params(":id")
  593. job, err := models.QueryModelConvertById(id)
  594. if err != nil {
  595. ctx.ServerError("Not found task.", err)
  596. return
  597. }
  598. AllDownload := ctx.QueryBool("AllDownload")
  599. if AllDownload {
  600. if job.IsGpuTrainTask() {
  601. path := setting.CBCodePathPrefix + job.ID + "/model/"
  602. allFile, err := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, path)
  603. if err == nil {
  604. returnFileName := job.Name + ".zip"
  605. MinioDownloadManyFile(path, ctx, returnFileName, allFile)
  606. } else {
  607. log.Info("error,msg=" + err.Error())
  608. ctx.ServerError("no file to download.", err)
  609. }
  610. } else {
  611. Prefix := path.Join(setting.TrainJobModelPath, job.ID, "output/", "V0001", "") + "/"
  612. log.Info("bucket=" + setting.Bucket + "prefix=" + Prefix)
  613. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, Prefix)
  614. if err == nil {
  615. returnFileName := job.Name + ".zip"
  616. ObsDownloadManyFile(Prefix, ctx, returnFileName, allFile)
  617. } else {
  618. log.Info("error,msg=" + err.Error())
  619. ctx.ServerError("no file to download.", err)
  620. }
  621. }
  622. } else {
  623. parentDir := ctx.Query("parentDir")
  624. fileName := ctx.Query("fileName")
  625. jobName := ctx.Query("jobName")
  626. if job.IsGpuTrainTask() {
  627. filePath := "jobs/" + jobName + "/model/" + parentDir
  628. url, err := storage.Attachments.PresignedGetURL(filePath, fileName)
  629. if err != nil {
  630. log.Error("PresignedGetURL failed: %v", err.Error(), ctx.Data["msgID"])
  631. ctx.ServerError("PresignedGetURL", err)
  632. return
  633. }
  634. //ctx.JSON(200, url)
  635. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusTemporaryRedirect)
  636. } else {
  637. ObjectKey := path.Join(setting.TrainJobModelPath, job.ID, "output/", "V0001", parentDir, fileName)
  638. log.Info("ObjectKey=" + ObjectKey)
  639. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, ObjectKey)
  640. if err != nil {
  641. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  642. ctx.ServerError("GetObsCreateSignedUrl", err)
  643. return
  644. }
  645. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusTemporaryRedirect)
  646. }
  647. }
  648. }