You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ai_model_convert.go 21 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strings"
  12. "code.gitea.io/gitea/models"
  13. "code.gitea.io/gitea/modules/cloudbrain"
  14. "code.gitea.io/gitea/modules/context"
  15. "code.gitea.io/gitea/modules/git"
  16. "code.gitea.io/gitea/modules/log"
  17. "code.gitea.io/gitea/modules/modelarts"
  18. "code.gitea.io/gitea/modules/setting"
  19. "code.gitea.io/gitea/modules/storage"
  20. "code.gitea.io/gitea/modules/timeutil"
  21. uuid "github.com/satori/go.uuid"
  22. )
// Constants used by the model-conversion handlers in this file.
const (
	// Template rendered by ConvertModelTemplate (conversion task list page).
	tplModelManageConvertIndex = "repo/modelmanage/convertIndex"
	// Template rendered by ShowModelConvertInfo (single conversion task page).
	tplModelConvertInfo = "repo/modelmanage/convertshowinfo"

	// Source-engine identifiers compared against AiModelConvert.SrcEngine.
	PYTORCH_ENGINE = 0
	TENSORFLOW_ENGINE = 1
	MINDSPORE_ENGIN = 2

	// Mount points used for the volumes of the GPU conversion job.
	ModelMountPath = "/model"
	CodeMountPath = "/code"
	DataSetMountPath = "/dataset"

	// Suffix of the file the GPU conversion command redirects its output to.
	LogFile = "log.txt"
	// Branch that is cloned from ConvertRepoPath.
	DefaultBranchName = "master"
	// Name of the single task role submitted with the GPU job.
	SubTaskName = "task1"
	// Value passed as GpuType when creating the GPU job.
	GpuQueue = "openidgx"
	// Result code that createGpuTrainJob treats as a successful job creation.
	Success = "S000"

	// Images used for the GPU job, selected by source engine.
	GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap"
	GPU_TENSORFLOW_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx"

	// Boot scripts (files inside the cloned conversion repository) selected
	// by source engine, target format and device type.
	PytorchOnnxBootFile = "convert_pytorch.py"
	PytorchTrTBootFile = "convert_pytorch_tensorrt.py"
	MindsporeBootFile = "convert_mindspore.py"
	TensorFlowNpuBootFile = "convert_tensorflow.py"
	TensorFlowGpuBootFile = "convert_tensorflow_gpu.py"

	// Repository the conversion scripts are cloned from.
	ConvertRepoPath = "https://git.openi.org.cn/zouap/npu_test"
	// NOTE(review): not referenced in this file; presumably the repository ID
	// of ConvertRepoPath - confirm before relying on it.
	REPO_ID = 33267

	// Target formats compared against AiModelConvert.DestFormat.
	CONVERT_FORMAT_ONNX = 0
	CONVERT_FORMAT_TRT = 1

	// Output precisions compared against AiModelConvert.NetOutputFormat.
	NetOutputFormat_FP32 = 0
	NetOutputFormat_FP16 = 1

	// Engine IDs passed to the NPU train job, selected by source engine.
	NPU_MINDSPORE_IMAGE_ID = 35
	NPU_TENSORFLOW_IMAGE_ID = 121

	// Index into TrainResourceSpecs.ResourceSpec used for the GPU job.
	GPU_Resource_Specs_ID = 1 //cpu 1, gpu 1
	// Flavor code and pool ID passed to the NPU train job.
	NPU_FlavorCode = "modelarts.bm.910.arm.public.1"
	NPU_PoolID = "pool7908321a"
)
var (
	// TrainResourceSpecs caches the resource specifications decoded from
	// setting.TrainResourceSpecs. It starts out nil and is filled in by
	// createGpuTrainJob the first time it is needed.
	TrainResourceSpecs *models.ResourceSpecs
)
  59. func SaveModelConvert(ctx *context.Context) {
  60. log.Info("save model convert start.")
  61. if !ctx.Repo.CanWrite(models.UnitTypeModelManage) {
  62. ctx.JSON(403, ctx.Tr("repo.model_noright"))
  63. return
  64. }
  65. name := ctx.Query("name")
  66. desc := ctx.Query("desc")
  67. modelId := ctx.Query("modelId")
  68. modelPath := ctx.Query("ModelFile")
  69. SrcEngine := ctx.QueryInt("SrcEngine")
  70. InputShape := ctx.Query("inputshape")
  71. InputDataFormat := ctx.Query("inputdataformat")
  72. DestFormat := ctx.QueryInt("DestFormat")
  73. NetOutputFormat := ctx.QueryInt("NetOutputFormat")
  74. task, err := models.QueryModelById(modelId)
  75. if err != nil {
  76. log.Error("no such model!", err.Error())
  77. ctx.ServerError("no such model:", err)
  78. return
  79. }
  80. uuid := uuid.NewV4()
  81. id := uuid.String()
  82. modelConvert := &models.AiModelConvert{
  83. ID: id,
  84. Name: name,
  85. Description: desc,
  86. Status: string(models.JobWaiting),
  87. SrcEngine: SrcEngine,
  88. RepoId: ctx.Repo.Repository.ID,
  89. ModelName: task.Name,
  90. ModelVersion: task.Version,
  91. ModelId: modelId,
  92. ModelPath: modelPath,
  93. DestFormat: DestFormat,
  94. NetOutputFormat: NetOutputFormat,
  95. InputShape: InputShape,
  96. InputDataFormat: InputDataFormat,
  97. UserId: ctx.User.ID,
  98. }
  99. models.SaveModelConvert(modelConvert)
  100. go goCreateTask(modelConvert, ctx, task)
  101. ctx.JSON(200, map[string]string{
  102. "result_code": "0",
  103. })
  104. }
  105. func goCreateTask(modelConvert *models.AiModelConvert, ctx *context.Context, task *models.AiModelManage) error {
  106. if modelConvert.IsGpuTrainTask() {
  107. log.Info("create gpu train job.")
  108. return createGpuTrainJob(modelConvert, ctx, task)
  109. } else {
  110. //create npu job
  111. log.Info("create npu train job.")
  112. return createNpuTrainJob(modelConvert, ctx, task.Path)
  113. }
  114. }
  115. func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error {
  116. VersionOutputPath := "V0001"
  117. codeLocalPath := setting.JobPath + modelConvert.ID + modelarts.CodePath
  118. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.CodePath
  119. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.OutputPath + VersionOutputPath + "/"
  120. logObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.LogPath + VersionOutputPath + "/"
  121. dataPath := "/" + modelRelativePath
  122. _, err := ioutil.ReadDir(codeLocalPath)
  123. if err == nil {
  124. os.RemoveAll(codeLocalPath)
  125. }
  126. if err := downloadConvertCode(ConvertRepoPath, codeLocalPath, DefaultBranchName); err != nil {
  127. log.Error("downloadCode failed, server timed out: %s (%v)", ConvertRepoPath, err)
  128. return err
  129. }
  130. if err := obsMkdir(setting.CodePathPrefix + modelConvert.ID + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  131. log.Error("Failed to obsMkdir_output: %s (%v)", modelConvert.ID+modelarts.OutputPath, err)
  132. return err
  133. }
  134. if err := obsMkdir(setting.CodePathPrefix + modelConvert.ID + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  135. log.Error("Failed to obsMkdir_log: %s (%v)", modelConvert.ID+modelarts.LogPath, err)
  136. return err
  137. }
  138. if err := uploadCodeToObs(codeLocalPath, modelConvert.ID, ""); err != nil {
  139. log.Error("Failed to uploadCodeToObs: %s (%v)", modelConvert.ID, err)
  140. return err
  141. }
  142. intputshape := strings.Split(modelConvert.InputShape, ",")
  143. n := "256"
  144. c := "1"
  145. h := "28"
  146. w := "28"
  147. if len(intputshape) == 4 {
  148. n = intputshape[0]
  149. c = intputshape[1]
  150. h = intputshape[2]
  151. w = intputshape[3]
  152. }
  153. param := make([]models.Parameter, 0)
  154. modelPara := models.Parameter{
  155. Label: "model",
  156. Value: modelConvert.ModelPath,
  157. }
  158. param = append(param, modelPara)
  159. batchSizePara := models.Parameter{
  160. Label: "n",
  161. Value: fmt.Sprint(n),
  162. }
  163. param = append(param, batchSizePara)
  164. channelSizePara := models.Parameter{
  165. Label: "c",
  166. Value: fmt.Sprint(c),
  167. }
  168. param = append(param, channelSizePara)
  169. heightPara := models.Parameter{
  170. Label: "h",
  171. Value: fmt.Sprint(h),
  172. }
  173. param = append(param, heightPara)
  174. widthPara := models.Parameter{
  175. Label: "w",
  176. Value: fmt.Sprint(w),
  177. }
  178. param = append(param, widthPara)
  179. var engineId int64
  180. engineId = int64(NPU_MINDSPORE_IMAGE_ID)
  181. bootfile := MindsporeBootFile
  182. if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
  183. engineId = int64(NPU_TENSORFLOW_IMAGE_ID)
  184. bootfile = TensorFlowNpuBootFile
  185. }
  186. userCommand := "/bin/bash /home/work/run_train.sh 's3://" + codeObsPath + "' '/code/" + bootfile + "' '/tmp/log/train.log' --'data_url'='s3://" + dataPath + "' --'train_url'='s3://" + outputObsPath + "'"
  187. req := &modelarts.GenerateTrainJobReq{
  188. JobName: modelConvert.ID,
  189. DisplayJobName: modelConvert.Name,
  190. DataUrl: dataPath,
  191. Description: modelConvert.Description,
  192. CodeObsPath: codeObsPath,
  193. BootFileUrl: codeObsPath + bootfile,
  194. BootFile: bootfile,
  195. TrainUrl: outputObsPath,
  196. FlavorCode: NPU_FlavorCode,
  197. WorkServerNumber: 1,
  198. IsLatestVersion: modelarts.IsLatestVersion,
  199. EngineID: engineId,
  200. LogUrl: logObsPath,
  201. PoolID: NPU_PoolID,
  202. Parameters: param,
  203. BranchName: DefaultBranchName,
  204. UserImageUrl: "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend",
  205. UserCommand: userCommand,
  206. }
  207. result, err := modelarts.GenerateModelConvertTrainJob(req)
  208. if err == nil {
  209. log.Info("jobId=" + fmt.Sprint(result.JobID) + " versionid=" + fmt.Sprint(result.VersionID))
  210. models.UpdateModelConvertModelArts(modelConvert.ID, fmt.Sprint(result.JobID), fmt.Sprint(result.VersionID))
  211. }
  212. return err
  213. }
  214. func downloadConvertCode(repopath string, codePath, branchName string) error {
  215. //add "file:///" prefix to make the depth valid
  216. if err := git.Clone(repopath, codePath, git.CloneRepoOptions{Branch: branchName, Depth: 1}); err != nil {
  217. log.Error("Failed to clone repository: %s (%v)", repopath, err)
  218. return err
  219. }
  220. log.Info("srcPath=" + repopath + " codePath=" + codePath)
  221. configFile, err := os.OpenFile(codePath+"/.git/config", os.O_RDWR, 0666)
  222. if err != nil {
  223. log.Error("open file(%s) failed:%v", codePath+"/,git/config", err)
  224. return err
  225. }
  226. defer configFile.Close()
  227. pos := int64(0)
  228. reader := bufio.NewReader(configFile)
  229. for {
  230. line, err := reader.ReadString('\n')
  231. if err != nil {
  232. if err == io.EOF {
  233. log.Error("not find the remote-url")
  234. return nil
  235. } else {
  236. log.Error("read error: %v", err)
  237. return err
  238. }
  239. }
  240. if strings.Contains(line, "url") && strings.Contains(line, ".git") {
  241. originUrl := "\turl = " + repopath + "\n"
  242. if len(line) > len(originUrl) {
  243. originUrl += strings.Repeat(" ", len(line)-len(originUrl))
  244. }
  245. bytes := []byte(originUrl)
  246. _, err := configFile.WriteAt(bytes, pos)
  247. if err != nil {
  248. log.Error("WriteAt failed:%v", err)
  249. return err
  250. }
  251. break
  252. }
  253. pos += int64(len(line))
  254. }
  255. return nil
  256. }
  257. func downloadFromObsToLocal(task *models.AiModelManage, localPath string) error {
  258. path := Model_prefix + models.AttachmentRelativePath(task.ID) + "/"
  259. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
  260. if err == nil {
  261. _, errState := os.Stat(localPath)
  262. if errState != nil {
  263. if err = os.MkdirAll(localPath, os.ModePerm); err != nil {
  264. return err
  265. }
  266. }
  267. for _, oneFile := range allFile {
  268. if oneFile.IsDir {
  269. log.Info(" dir name:" + oneFile.FileName)
  270. } else {
  271. allFileName := localPath + "/" + oneFile.FileName
  272. index := strings.LastIndex(allFileName, "/")
  273. if index != -1 {
  274. parentDir := allFileName[0:index]
  275. if err = os.MkdirAll(parentDir, os.ModePerm); err != nil {
  276. log.Info("make dir may be error," + err.Error())
  277. }
  278. }
  279. fDest, err := os.Create(allFileName)
  280. if err != nil {
  281. log.Info("create file error, download file failed: %s\n", err.Error())
  282. return err
  283. }
  284. body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName)
  285. if err != nil {
  286. log.Info("download file failed: %s\n", err.Error())
  287. return err
  288. } else {
  289. defer body.Close()
  290. p := make([]byte, 1024)
  291. var readErr error
  292. var readCount int
  293. // 读取对象内容
  294. for {
  295. readCount, readErr = body.Read(p)
  296. if readCount > 0 {
  297. fDest.Write(p[:readCount])
  298. }
  299. if readErr != nil {
  300. break
  301. }
  302. }
  303. }
  304. }
  305. }
  306. } else {
  307. log.Info("error,msg=" + err.Error())
  308. return err
  309. }
  310. return nil
  311. }
  312. func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, model *models.AiModelManage) error {
  313. modelRelativePath := model.Path
  314. command := ""
  315. IMAGE_URL := GPU_PYTORCH_IMAGE
  316. dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath
  317. if modelConvert.SrcEngine == PYTORCH_ENGINE {
  318. if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
  319. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchOnnxBootFile)
  320. } else if modelConvert.DestFormat == CONVERT_FORMAT_TRT {
  321. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchTrTBootFile)
  322. } else {
  323. return errors.New("Not support the format.")
  324. }
  325. } else if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
  326. IMAGE_URL = GPU_TENSORFLOW_IMAGE
  327. if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
  328. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, TensorFlowGpuBootFile)
  329. } else {
  330. return errors.New("Not support the format.")
  331. }
  332. //如果模型在OBS上,需要下载到本地,并上传到minio中
  333. if model.Type == models.TypeCloudBrainTwo {
  334. relatetiveModelPath := setting.JobPath + modelConvert.ID + "/dataset"
  335. log.Info("local dataset path:" + relatetiveModelPath)
  336. downloadFromObsToLocal(model, relatetiveModelPath)
  337. uploadCodeToMinio(relatetiveModelPath+"/", modelConvert.ID, "/dataset/")
  338. dataActualPath = setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/dataset"
  339. }
  340. }
  341. log.Info("dataActualPath=" + dataActualPath)
  342. log.Info("command=" + command)
  343. codePath := setting.JobPath + modelConvert.ID + CodeMountPath
  344. downloadConvertCode(ConvertRepoPath, codePath, DefaultBranchName)
  345. uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/")
  346. minioCodePath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/code"
  347. log.Info("minio codePath=" + minioCodePath)
  348. modelPath := setting.JobPath + modelConvert.ID + ModelMountPath + "/"
  349. log.Info("local modelPath=" + modelPath)
  350. mkModelPath(modelPath)
  351. uploadCodeToMinio(modelPath, modelConvert.ID, ModelMountPath+"/")
  352. minioModelPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/model"
  353. log.Info("minio model path=" + minioModelPath)
  354. if TrainResourceSpecs == nil {
  355. json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
  356. }
  357. resourceSpec := TrainResourceSpecs.ResourceSpec[GPU_Resource_Specs_ID]
  358. jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{
  359. JobName: modelConvert.ID,
  360. RetryCount: 1,
  361. GpuType: GpuQueue,
  362. Image: IMAGE_URL,
  363. TaskRoles: []models.TaskRole{
  364. {
  365. Name: SubTaskName,
  366. TaskNumber: 1,
  367. MinSucceededTaskCount: 1,
  368. MinFailedTaskCount: 1,
  369. CPUNumber: resourceSpec.CpuNum,
  370. GPUNumber: resourceSpec.GpuNum,
  371. MemoryMB: resourceSpec.MemMiB,
  372. ShmMB: resourceSpec.ShareMemMiB,
  373. Command: command,
  374. NeedIBDevice: false,
  375. IsMainRole: false,
  376. UseNNI: false,
  377. },
  378. },
  379. Volumes: []models.Volume{
  380. {
  381. HostPath: models.StHostPath{
  382. Path: minioCodePath,
  383. MountPath: CodeMountPath,
  384. ReadOnly: false,
  385. },
  386. },
  387. {
  388. HostPath: models.StHostPath{
  389. Path: dataActualPath,
  390. MountPath: DataSetMountPath,
  391. ReadOnly: true,
  392. },
  393. },
  394. {
  395. HostPath: models.StHostPath{
  396. Path: minioModelPath,
  397. MountPath: ModelMountPath,
  398. ReadOnly: false,
  399. },
  400. },
  401. },
  402. })
  403. if err != nil {
  404. log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
  405. return err
  406. }
  407. if jobResult.Code != Success {
  408. log.Error("CreateJob(%s) failed:%s", modelConvert.ID, jobResult.Msg, ctx.Data["MsgID"])
  409. return errors.New(jobResult.Msg)
  410. }
  411. var jobID = jobResult.Payload["jobId"].(string)
  412. log.Info("jobId=" + jobID)
  413. models.UpdateModelConvertCBTI(modelConvert.ID, jobID)
  414. return nil
  415. }
  416. func getGpuModelConvertCommand(name string, modelFile string, modelConvert *models.AiModelConvert, bootfile string) string {
  417. var command string
  418. intputshape := strings.Split(modelConvert.InputShape, ",")
  419. n := "256"
  420. c := "1"
  421. h := "28"
  422. w := "28"
  423. if len(intputshape) == 4 {
  424. n = intputshape[0]
  425. c = intputshape[1]
  426. h = intputshape[2]
  427. w = intputshape[3]
  428. }
  429. command += "python3 /code/" + bootfile + " --model " + modelFile + " --n " + n + " --c " + c + " --h " + h + " --w " + w
  430. if modelConvert.DestFormat == CONVERT_FORMAT_TRT {
  431. if modelConvert.NetOutputFormat == NetOutputFormat_FP16 {
  432. command += " --fp16 True"
  433. } else {
  434. command += " --fp16 False"
  435. }
  436. }
  437. command += " > " + ModelMountPath + "/" + name + "-" + LogFile
  438. return command
  439. }
  440. func DeleteModelConvert(ctx *context.Context) {
  441. log.Info("delete model convert start.")
  442. id := ctx.Params(":id")
  443. err := models.DeleteModelConvertById(id)
  444. if err != nil {
  445. ctx.JSON(500, err.Error())
  446. } else {
  447. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelmanage/convert_model")
  448. }
  449. }
  450. func StopModelConvert(ctx *context.Context) {
  451. id := ctx.Params(":id")
  452. log.Info("stop model convert start.id=" + id)
  453. job, err := models.QueryModelConvertById(id)
  454. if err != nil {
  455. ctx.ServerError("Not found task.", err)
  456. return
  457. }
  458. if job.IsGpuTrainTask() {
  459. err = cloudbrain.StopJob(job.CloudBrainTaskId)
  460. if err != nil {
  461. log.Error("Stop cloudbrain Job(%s) failed:%v", job.CloudBrainTaskId, err)
  462. }
  463. } else {
  464. _, err = modelarts.StopTrainJob(job.CloudBrainTaskId, job.ModelArtsVersionId)
  465. if err != nil {
  466. log.Error("Stop modelarts Job(%s) failed:%v", job.CloudBrainTaskId, err)
  467. }
  468. }
  469. job.Status = string(models.JobStopped)
  470. if job.EndTime == 0 {
  471. job.EndTime = timeutil.TimeStampNow()
  472. }
  473. models.ModelConvertSetDuration(job)
  474. err = models.UpdateModelConvert(job)
  475. if err != nil {
  476. log.Error("UpdateModelConvert failed:", err)
  477. }
  478. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelmanage/convert_model")
  479. }
  480. func ShowModelConvertInfo(ctx *context.Context) {
  481. ctx.Data["ID"] = ctx.Query("ID")
  482. ctx.Data["isModelManage"] = true
  483. ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
  484. job, err := models.QueryModelConvertById(ctx.Query("ID"))
  485. if err == nil {
  486. ctx.Data["task"] = job
  487. } else {
  488. ctx.ServerError("Not found task.", err)
  489. return
  490. }
  491. ctx.Data["Name"] = job.Name
  492. ctx.Data["canDownload"] = isOper(ctx, job.UserId)
  493. user, err := models.GetUserByID(job.UserId)
  494. if err == nil {
  495. job.UserName = user.Name
  496. job.UserRelAvatarLink = user.RelAvatarLink()
  497. }
  498. if job.IsGpuTrainTask() {
  499. ctx.Data["npu_display"] = "none"
  500. ctx.Data["gpu_display"] = "block"
  501. result, err := cloudbrain.GetJob(job.CloudBrainTaskId)
  502. if err != nil {
  503. log.Info("error:" + err.Error())
  504. ctx.Data["error"] = err.Error()
  505. return
  506. }
  507. if result != nil {
  508. jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
  509. ctx.Data["result"] = jobRes
  510. taskRoles := jobRes.TaskRoles
  511. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  512. ctx.Data["taskRes"] = taskRes
  513. ctx.Data["ExitDiagnostics"] = taskRes.TaskStatuses[0].ExitDiagnostics
  514. ctx.Data["AppExitDiagnostics"] = jobRes.JobStatus.AppExitDiagnostics
  515. job.Status = jobRes.JobStatus.State
  516. if jobRes.JobStatus.State != string(models.JobWaiting) && jobRes.JobStatus.State != string(models.JobFailed) {
  517. job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
  518. job.ContainerID = taskRes.TaskStatuses[0].ContainerID
  519. job.Status = taskRes.TaskStatuses[0].State
  520. }
  521. if jobRes.JobStatus.State != string(models.JobWaiting) {
  522. models.ModelComputeAndSetDuration(job, jobRes)
  523. err = models.UpdateModelConvert(job)
  524. if err != nil {
  525. log.Error("UpdateModelConvert failed:", err)
  526. }
  527. }
  528. }
  529. } else {
  530. ctx.Data["npu_display"] = "block"
  531. ctx.Data["gpu_display"] = "none"
  532. ctx.Data["ExitDiagnostics"] = ""
  533. ctx.Data["AppExitDiagnostics"] = ""
  534. }
  535. ctx.HTML(200, tplModelConvertInfo)
  536. }
  537. func ConvertModelTemplate(ctx *context.Context) {
  538. ctx.Data["isModelManage"] = true
  539. ctx.Data["MODEL_COUNT"] = 0
  540. ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
  541. ctx.Data["TRAIN_COUNT"] = 0
  542. ShowModelConvertPageInfo(ctx)
  543. ctx.HTML(200, tplModelManageConvertIndex)
  544. }
  545. func ShowModelConvertPageInfo(ctx *context.Context) {
  546. log.Info("ShowModelConvertInfo start.")
  547. if !isQueryRight(ctx) {
  548. log.Info("no right.")
  549. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  550. return
  551. }
  552. page := ctx.QueryInt("page")
  553. if page <= 0 {
  554. page = 1
  555. }
  556. pageSize := ctx.QueryInt("pageSize")
  557. if pageSize <= 0 {
  558. pageSize = setting.UI.IssuePagingNum
  559. }
  560. repoId := ctx.Repo.Repository.ID
  561. modelResult, count, err := models.QueryModelConvert(&models.AiModelQueryOptions{
  562. ListOptions: models.ListOptions{
  563. Page: page,
  564. PageSize: pageSize,
  565. },
  566. RepoID: repoId,
  567. })
  568. if err != nil {
  569. log.Info("query db error." + err.Error())
  570. ctx.ServerError("Cloudbrain", err)
  571. return
  572. }
  573. userIds := make([]int64, len(modelResult))
  574. for i, model := range modelResult {
  575. model.IsCanOper = isOper(ctx, model.UserId)
  576. model.IsCanDelete = isCanDelete(ctx, model.UserId)
  577. userIds[i] = model.UserId
  578. }
  579. userNameMap := queryUserName(userIds)
  580. for _, model := range modelResult {
  581. value := userNameMap[model.UserId]
  582. if value != nil {
  583. model.UserName = value.Name
  584. model.UserRelAvatarLink = value.RelAvatarLink()
  585. }
  586. }
  587. pager := context.NewPagination(int(count), page, pageSize, 5)
  588. ctx.Data["Page"] = pager
  589. ctx.Data["Tasks"] = modelResult
  590. }
  591. func ModelConvertDownloadModel(ctx *context.Context) {
  592. log.Info("enter here......")
  593. id := ctx.Params(":id")
  594. job, err := models.QueryModelConvertById(id)
  595. if err != nil {
  596. ctx.ServerError("Not found task.", err)
  597. return
  598. }
  599. AllDownload := ctx.QueryBool("AllDownload")
  600. if AllDownload {
  601. if job.IsGpuTrainTask() {
  602. path := setting.CBCodePathPrefix + job.ID + "/model/"
  603. allFile, err := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, path)
  604. if err == nil {
  605. returnFileName := job.Name + ".zip"
  606. MinioDownloadManyFile(path, ctx, returnFileName, allFile)
  607. } else {
  608. log.Info("error,msg=" + err.Error())
  609. ctx.ServerError("no file to download.", err)
  610. }
  611. } else {
  612. }
  613. } else {
  614. if job.IsGpuTrainTask() {
  615. parentDir := ctx.Query("parentDir")
  616. fileName := ctx.Query("fileName")
  617. jobName := ctx.Query("jobName")
  618. filePath := "jobs/" + jobName + "/model/" + parentDir
  619. url, err := storage.Attachments.PresignedGetURL(filePath, fileName)
  620. if err != nil {
  621. log.Error("PresignedGetURL failed: %v", err.Error(), ctx.Data["msgID"])
  622. ctx.ServerError("PresignedGetURL", err)
  623. return
  624. }
  625. //ctx.JSON(200, url)
  626. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusTemporaryRedirect)
  627. } else {
  628. }
  629. }
  630. }