You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aisafety.go 43 kB


  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/aisafety"
  16. "code.gitea.io/gitea/modules/cloudbrain"
  17. "code.gitea.io/gitea/modules/context"
  18. "code.gitea.io/gitea/modules/git"
  19. "code.gitea.io/gitea/modules/grampus"
  20. "code.gitea.io/gitea/modules/log"
  21. "code.gitea.io/gitea/modules/modelarts"
  22. "code.gitea.io/gitea/modules/setting"
  23. "code.gitea.io/gitea/modules/storage"
  24. "code.gitea.io/gitea/modules/timeutil"
  25. "code.gitea.io/gitea/modules/util"
  26. "code.gitea.io/gitea/services/cloudbrain/resource"
  27. "code.gitea.io/gitea/services/reward/point/account"
  28. uuid "github.com/satori/go.uuid"
  29. )
  // Template names rendered by the model safety test handlers in this file.
  const (
  	// Creation page rendered by AiSafetyCreateForGetGrampusGPU.
  	tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
  	// Creation page rendered by AiSafetyCreateForGetGrampusNPU.
  	tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
  	// Creation page rendered by AiSafetyCreateForGetGPU.
  	tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"
  	// Creation page rendered by AiSafetyCreateForGetNPU.
  	tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"
  	// Detail page rendered by GetAiSafetyTaskTmpl.
  	tplModelSafetyTestShow = "repo/modelsafety/show"
  )
  37. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  38. log.Info("start to create CloudBrainAiSafetyCreate")
  39. uuid := uuid.NewV4()
  40. id := uuid.String()
  41. seriaNoParas := ctx.Query("serialNo")
  42. fileName := ctx.Query("fileName")
  43. //if jobType == string(models.JobTypeBenchmark) {
  44. req := aisafety.TaskReq{
  45. UnionId: id,
  46. EvalName: "test1",
  47. EvalContent: "test1",
  48. TLPath: "test1",
  49. Indicators: []string{"ACC", "ASS"},
  50. CDName: "CIFAR10_1000_FGSM",
  51. BDName: "CIFAR10_1000基础数据集",
  52. }
  53. aisafety.GetAlgorithmList()
  54. if seriaNoParas != "" {
  55. aisafety.GetTaskStatus(seriaNoParas)
  56. } else {
  57. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  58. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  59. if err == nil {
  60. log.Info("serialNo=" + serialNo)
  61. time.Sleep(time.Duration(2) * time.Second)
  62. aisafety.GetTaskStatus(serialNo)
  63. } else {
  64. log.Info("CreateSafetyTask error," + err.Error())
  65. }
  66. }
  67. }
  68. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  69. if job == nil {
  70. log.Error("GetCloudbrainByJobID failed")
  71. return
  72. }
  73. syncAiSafetyTaskStatus(job)
  74. }
  75. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  76. ctx.Data["id"] = ctx.Params(":jobid")
  77. ctx.Data["PageIsCloudBrain"] = true
  78. ctx.HTML(200, tplModelSafetyTestShow)
  79. }
  80. func GetAiSafetyTask(ctx *context.Context) {
  81. var ID = ctx.Params(":jobid")
  82. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  83. if err != nil {
  84. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  85. return
  86. }
  87. syncAiSafetyTaskStatus(job)
  88. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  89. job.BenchmarkType = "安全评测"
  90. job.BenchmarkTypeName = "Image Classification"
  91. job.CanModify = cloudbrain.CanModifyJob(ctx, job)
  92. job.CanDel = cloudbrain.CanDeleteJob(ctx, job)
  93. s, err := resource.GetCloudbrainSpec(job.ID)
  94. if err == nil {
  95. job.Spec = s
  96. }
  97. user, err := models.GetUserByID(job.UserID)
  98. if err == nil {
  99. tmpUser := &models.User{
  100. Name: user.Name,
  101. }
  102. job.User = tmpUser
  103. }
  104. ctx.JSON(200, job)
  105. }
  106. func StopAiSafetyTask(ctx *context.Context) {
  107. log.Info("start to stop the task.")
  108. var ID = ctx.Params(":jobid")
  109. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  110. result := make(map[string]interface{})
  111. result["code"] = -1
  112. if err != nil {
  113. log.Info("query task error.err=" + err.Error())
  114. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  115. result["msg"] = "No such task."
  116. ctx.JSON(200, result)
  117. return
  118. }
  119. if isTaskNotFinished(task.Status) {
  120. if task.Type == models.TypeCloudBrainTwo {
  121. log.Info("start to stop model arts task.")
  122. _, err := modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  123. if err != nil {
  124. log.Info("stop failed.err=" + err.Error())
  125. }
  126. task.Status = string(models.JobStopped)
  127. if task.EndTime == 0 {
  128. task.EndTime = timeutil.TimeStampNow()
  129. }
  130. task.ComputeAndSetDuration()
  131. err = models.UpdateJob(task)
  132. if err != nil {
  133. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  134. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  135. ctx.JSON(200, result)
  136. return
  137. }
  138. //queryTaskStatusFromCloudbrainTwo(job)
  139. } else if task.Type == models.TypeCloudBrainOne {
  140. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  141. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  142. result["msg"] = "cloudbrain.Already_stopped"
  143. ctx.JSON(200, result)
  144. return
  145. }
  146. err := cloudbrain.StopJob(task.JobID)
  147. if err != nil {
  148. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  149. result["msg"] = "cloudbrain.Stopped_failed"
  150. ctx.JSON(200, result)
  151. return
  152. }
  153. task.Status = string(models.JobStopped)
  154. if task.EndTime == 0 {
  155. task.EndTime = timeutil.TimeStampNow()
  156. }
  157. task.ComputeAndSetDuration()
  158. err = models.UpdateJob(task)
  159. if err != nil {
  160. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  161. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  162. ctx.JSON(200, result)
  163. return
  164. }
  165. }
  166. } else {
  167. if task.Status == string(models.ModelSafetyTesting) {
  168. //修改为Failed
  169. task.Status = string(models.JobStopped)
  170. if task.EndTime == 0 {
  171. task.EndTime = timeutil.TimeStampNow()
  172. }
  173. task.ComputeAndSetDuration()
  174. err = models.UpdateJob(task)
  175. if err != nil {
  176. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  177. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  178. ctx.JSON(200, result)
  179. return
  180. }
  181. } else {
  182. log.Info("The job is finished. status=" + task.Status)
  183. }
  184. }
  185. }
  186. func DelAiSafetyTask(ctx *context.Context) {
  187. var ID = ctx.Params(":jobid")
  188. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  189. result := make(map[string]interface{})
  190. result["code"] = 1
  191. if err != nil {
  192. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  193. result["msg"] = "No such task."
  194. ctx.ServerError("No such task.", err)
  195. return
  196. }
  197. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  198. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  199. result["msg"] = "the job(" + task.JobName + ") has not been stopped"
  200. ctx.ServerError("the job("+task.JobName+") has not been stopped", nil)
  201. return
  202. }
  203. if task.Type == models.TypeCloudBrainOne {
  204. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  205. }
  206. err = models.DeleteJob(task)
  207. if err != nil {
  208. result["msg"] = err.Error()
  209. ctx.ServerError(err.Error(), err)
  210. return
  211. }
  212. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  213. }
  214. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  215. log.Info("start to query safety task status.")
  216. if isTaskNotFinished(job.Status) {
  217. if job.Type == models.TypeCloudBrainTwo {
  218. queryTaskStatusFromCloudbrainTwo(job)
  219. } else if job.Type == models.TypeCloudBrainOne {
  220. queryTaskStatusFromCloudbrain(job)
  221. } else if job.Type == models.TypeC2Net {
  222. queryTaskStatusFromGrampus(job)
  223. }
  224. } else {
  225. if job.Status == string(models.ModelSafetyTesting) {
  226. queryTaskStatusFromModelSafetyTestServer(job)
  227. } else {
  228. log.Info("The job is finished. status=" + job.Status)
  229. }
  230. }
  231. }
  232. func TimerHandleModelSafetyTestTask() {
  233. log.Info("start to TimerHandleModelSafetyTestTask")
  234. tasks, err := models.GetModelSafetyTestTask()
  235. if err == nil {
  236. if tasks != nil && len(tasks) > 0 {
  237. for _, job := range tasks {
  238. syncAiSafetyTaskStatus(job)
  239. }
  240. } else {
  241. log.Info("query running model safety test task 0.")
  242. }
  243. } else {
  244. log.Info("query running model safety test task err." + err.Error())
  245. }
  246. }
  247. func queryTaskStatusFromGrampus(task *models.Cloudbrain) {
  248. log.Info("The task not finished,name=" + task.DisplayJobName)
  249. if task.DeletedAt.IsZero() { //normal record
  250. result, err := grampus.GetJob(task.JobID)
  251. resultJson, _ := json.Marshal(result)
  252. log.Info("resultJson=" + string(resultJson))
  253. if err != nil {
  254. log.Error("GetJob failed:" + err.Error())
  255. return
  256. }
  257. if result != nil {
  258. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  259. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  260. }
  261. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  262. if task.Status != models.GrampusStatusSucceeded {
  263. if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning {
  264. task.Duration = result.JobInfo.RunSec
  265. if task.Duration < 0 {
  266. task.Duration = 0
  267. }
  268. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  269. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  270. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  271. }
  272. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  273. task.EndTime = task.StartTime.Add(task.Duration)
  274. }
  275. task.CorrectCreateUnix()
  276. err = models.UpdateJob(task)
  277. if err != nil {
  278. log.Error("UpdateJob failed:" + err.Error())
  279. }
  280. }
  281. } else {
  282. task.Status = string(models.ModelSafetyTesting)
  283. err = models.UpdateJob(task)
  284. if err != nil {
  285. log.Error("UpdateJob failed:", err)
  286. }
  287. //send msg to beihang
  288. sendGPUInferenceResultToTest(task)
  289. }
  290. }
  291. }
  292. }
  293. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  294. log.Info("The task not finished,name=" + job.DisplayJobName)
  295. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  296. if err != nil {
  297. log.Info("query train job error." + err.Error())
  298. return
  299. }
  300. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  301. job.Duration = result.Duration / 1000
  302. job.TrainJobDuration = result.TrainJobDuration
  303. if job.StartTime == 0 && result.StartTime > 0 {
  304. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  305. }
  306. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  307. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  308. job.EndTime = job.StartTime.Add(job.Duration)
  309. }
  310. job.CorrectCreateUnix()
  311. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  312. log.Info("CloudbrainTwo task status=" + job.Status)
  313. err = models.UpdateJob(job)
  314. if err != nil {
  315. log.Error("UpdateJob failed:", err)
  316. }
  317. } else {
  318. log.Info("start to deal ModelSafetyTesting, task status=" + job.Status)
  319. job.Status = string(models.ModelSafetyTesting)
  320. err = models.UpdateJob(job)
  321. if err != nil {
  322. log.Error("UpdateJob failed:", err)
  323. }
  324. //send msg to beihang
  325. sendNPUInferenceResultToTest(job)
  326. }
  327. }
  328. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  329. log.Info("start to sendNPUInferenceResultToTest")
  330. datasetname := job.DatasetName
  331. datasetnames := strings.Split(datasetname, ";")
  332. indicator := job.LabelName
  333. EvalContent := "test1"
  334. if job.Description != "" {
  335. EvalContent = job.Description
  336. }
  337. req := aisafety.TaskReq{
  338. UnionId: job.JobID,
  339. EvalName: job.DisplayJobName,
  340. EvalContent: EvalContent,
  341. TLPath: "test",
  342. Indicators: strings.Split(indicator, ";"),
  343. CDName: strings.Split(datasetnames[1], ".")[0],
  344. BDName: strings.Split(datasetnames[0], ".")[0],
  345. }
  346. log.Info("CDName=" + req.CDName)
  347. log.Info("BDName=" + req.BDName)
  348. jsonContent := ""
  349. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  350. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  351. resultPath = resultPath[1:]
  352. log.Info("bucket=" + setting.Bucket + " resultPath=" + resultPath)
  353. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  354. if err != nil {
  355. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  356. } else {
  357. defer body.Close()
  358. var data []byte
  359. p := make([]byte, 4096)
  360. var readErr error
  361. var readCount int
  362. for {
  363. readCount, readErr = body.Read(p)
  364. if readCount > 0 {
  365. data = append(data, p[:readCount]...)
  366. }
  367. if readErr != nil || readCount == 0 {
  368. break
  369. }
  370. }
  371. jsonContent = string(data)
  372. }
  373. if jsonContent != "" {
  374. log.Info("start to send beihang ...")
  375. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  376. if err == nil {
  377. //update serial no to db
  378. job.PreVersionName = serialNo
  379. err = models.UpdateJob(job)
  380. if err != nil {
  381. log.Error("UpdateJob failed:", err)
  382. }
  383. }
  384. } else {
  385. log.Info("The json is null. so set it failed.")
  386. //update task failed.
  387. job.Status = string(models.ModelArtsTrainJobFailed)
  388. err := models.UpdateJob(job)
  389. if err != nil {
  390. log.Error("UpdateJob failed:", err)
  391. }
  392. }
  393. }
  394. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  395. log.Info("The task not finished,name=" + job.DisplayJobName)
  396. jobResult, err := cloudbrain.GetJob(job.JobID)
  397. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  398. if err != nil {
  399. log.Error("ConvertToJobResultPayload failed:", err)
  400. return
  401. }
  402. job.Status = result.JobStatus.State
  403. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  404. taskRoles := result.TaskRoles
  405. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  406. job.Status = taskRes.TaskStatuses[0].State
  407. }
  408. if result.JobStatus.State != string(models.JobSucceeded) {
  409. err = models.UpdateJob(job)
  410. if err != nil {
  411. log.Error("UpdateJob failed:", err)
  412. }
  413. } else {
  414. //
  415. job.Status = string(models.ModelSafetyTesting)
  416. err = models.UpdateJob(job)
  417. if err != nil {
  418. log.Error("UpdateJob failed:", err)
  419. }
  420. //send msg to beihang
  421. sendGPUInferenceResultToTest(job)
  422. }
  423. }
  424. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  425. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  426. if err == nil {
  427. if result.Code == "0" {
  428. if result.Data.Status == 1 {
  429. log.Info("The task is running....")
  430. } else {
  431. if result.Data.Code == 0 {
  432. job.ResultJson = result.Data.StandardJson
  433. job.Status = string(models.JobSucceeded)
  434. err = models.UpdateJob(job)
  435. if err != nil {
  436. log.Error("UpdateJob failed:", err)
  437. }
  438. }
  439. }
  440. } else {
  441. log.Info("The task is failed.")
  442. job.Status = string(models.JobFailed)
  443. err = models.UpdateJob(job)
  444. if err != nil {
  445. log.Error("UpdateJob failed:", err)
  446. }
  447. }
  448. } else {
  449. log.Info("The task not found.....")
  450. }
  451. }
  452. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  453. datasetname := job.DatasetName
  454. datasetnames := strings.Split(datasetname, ";")
  455. indicator := job.LabelName
  456. EvalContent := "test1"
  457. if job.Description != "" {
  458. EvalContent = job.Description
  459. }
  460. req := aisafety.TaskReq{
  461. UnionId: job.JobID,
  462. EvalName: job.DisplayJobName,
  463. EvalContent: EvalContent,
  464. TLPath: "test1",
  465. Indicators: strings.Split(indicator, ";"),
  466. CDName: datasetnames[1],
  467. BDName: datasetnames[0],
  468. }
  469. resultDir := "/model"
  470. prefix := "/" + setting.CBCodePathPrefix + job.JobName + resultDir
  471. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  472. if err != nil {
  473. log.Error("query cloudbrain one model failed: %v", err)
  474. return
  475. }
  476. jsonContent := ""
  477. for _, file := range files {
  478. if strings.HasSuffix(file.FileName, "result.json") {
  479. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  480. log.Info("path=" + path)
  481. reader, err := os.Open(path)
  482. defer reader.Close()
  483. if err == nil {
  484. r := bufio.NewReader(reader)
  485. for {
  486. line, error := r.ReadString('\n')
  487. if error == io.EOF {
  488. log.Info("read file completed.")
  489. break
  490. }
  491. if error != nil {
  492. log.Info("read file error." + error.Error())
  493. break
  494. }
  495. jsonContent += line
  496. }
  497. }
  498. break
  499. }
  500. }
  501. if jsonContent != "" {
  502. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  503. if err == nil {
  504. //update serial no to db
  505. job.PreVersionName = serialNo
  506. err = models.UpdateJob(job)
  507. if err != nil {
  508. log.Error("UpdateJob failed:", err)
  509. }
  510. }
  511. } else {
  512. log.Info("The json is null. so set it failed.")
  513. //update task failed.
  514. job.Status = string(models.JobFailed)
  515. err = models.UpdateJob(job)
  516. if err != nil {
  517. log.Error("UpdateJob failed:", err)
  518. }
  519. }
  520. }
  521. func isTaskNotFinished(status string) bool {
  522. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  523. return true
  524. }
  525. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  526. return true
  527. }
  528. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  529. return true
  530. }
  531. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  532. return true
  533. }
  534. return false
  535. }
  536. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  537. t := time.Now()
  538. ctx.Data["PageIsCloudBrain"] = true
  539. ctx.Data["IsCreate"] = true
  540. ctx.Data["type"] = models.TypeCloudBrainOne
  541. ctx.Data["compute_resource"] = models.GPUResource
  542. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  543. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  544. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  545. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  546. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  547. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  548. ctx.Data["display_job_name"] = displayJobName
  549. prepareCloudbrainOneSpecs(ctx)
  550. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  551. if queuesDetail != nil {
  552. ctx.Data["QueuesDetail"] = queuesDetail
  553. }
  554. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  555. }
  556. func AiSafetyCreateForGetGrampusGPU(ctx *context.Context) {
  557. ctx.Data["PageIsCloudBrain"] = true
  558. ctx.Data["IsCreate"] = true
  559. ctx.Data["type"] = models.TypeC2Net
  560. ctx.Data["compute_resource"] = models.GPUResource
  561. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  562. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  563. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  564. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  565. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  566. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  567. if err != nil {
  568. ctx.ServerError("get new train-job info failed", err)
  569. return
  570. }
  571. ctx.HTML(200, tplModelSafetyTestCreateGrampusGpu)
  572. }
  573. func AiSafetyCreateForGetGrampusNPU(ctx *context.Context) {
  574. ctx.Data["PageIsCloudBrain"] = true
  575. ctx.Data["IsCreate"] = true
  576. ctx.Data["type"] = models.TypeC2Net
  577. ctx.Data["compute_resource"] = models.NPUResource
  578. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  579. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  580. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  581. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  582. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  583. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  584. if err != nil {
  585. ctx.ServerError("get new train-job info failed", err)
  586. return
  587. }
  588. ctx.HTML(200, tplModelSafetyTestCreateGrampusNpu)
  589. }
  590. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  591. t := time.Now()
  592. ctx.Data["PageIsCloudBrain"] = true
  593. ctx.Data["IsCreate"] = true
  594. ctx.Data["type"] = models.TypeCloudBrainTwo
  595. ctx.Data["compute_resource"] = models.NPUResource
  596. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  597. ctx.Data["display_job_name"] = displayJobName
  598. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  599. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  600. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  601. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  602. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  603. var resourcePools modelarts.ResourcePool
  604. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  605. ctx.ServerError("json.Unmarshal failed:", err)
  606. }
  607. ctx.Data["resource_pools"] = resourcePools.Info
  608. var engines modelarts.Engine
  609. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  610. ctx.ServerError("json.Unmarshal failed:", err)
  611. }
  612. ctx.Data["engines"] = engines.Info
  613. var versionInfos modelarts.VersionInfo
  614. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  615. ctx.ServerError("json.Unmarshal failed:", err)
  616. }
  617. ctx.Data["engine_versions"] = versionInfos.Version
  618. prepareCloudbrainTwoInferenceSpecs(ctx)
  619. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  620. ctx.Data["WaitCount"] = waitCount
  621. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  622. }
  623. func AiSafetyCreateForPost(ctx *context.Context) {
  624. ctx.Data["PageIsCloudBrain"] = true
  625. displayJobName := ctx.Query("display_job_name")
  626. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  627. taskType := ctx.QueryInt("type")
  628. description := ctx.Query("description")
  629. ctx.Data["type"] = taskType
  630. ctx.Data["displayJobName"] = displayJobName
  631. ctx.Data["description"] = description
  632. repo := ctx.Repo.Repository
  633. tpname := tplCloudBrainModelSafetyNewNpu
  634. if taskType == models.TypeCloudBrainOne {
  635. tpname = tplCloudBrainModelSafetyNewGpu
  636. }
  637. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  638. if err == nil {
  639. if len(tasks) != 0 {
  640. log.Error("the job name did already exist", ctx.Data["MsgID"])
  641. modelSafetyNewDataPrepare(ctx)
  642. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  643. return
  644. }
  645. } else {
  646. if !models.IsErrJobNotExist(err) {
  647. log.Error("system error, %v", err, ctx.Data["MsgID"])
  648. modelSafetyNewDataPrepare(ctx)
  649. ctx.RenderWithErr("system error", tpname, nil)
  650. return
  651. }
  652. }
  653. if !jobNamePattern.MatchString(jobName) {
  654. modelSafetyNewDataPrepare(ctx)
  655. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  656. return
  657. }
  658. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  659. if err != nil {
  660. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  661. modelSafetyNewDataPrepare(ctx)
  662. ctx.RenderWithErr("system error", tpname, nil)
  663. return
  664. } else {
  665. if count >= 1 {
  666. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  667. modelSafetyNewDataPrepare(ctx)
  668. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  669. return
  670. }
  671. }
  672. BootFile := ctx.Query("boot_file")
  673. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  674. if err != nil || !bootFileExist {
  675. log.Error("Get bootfile error:", err)
  676. modelSafetyNewDataPrepare(ctx)
  677. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  678. return
  679. }
  680. if taskType == models.TypeCloudBrainTwo {
  681. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  682. createForNPU(ctx, jobName)
  683. } else if taskType == models.TypeCloudBrainOne {
  684. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  685. createForGPU(ctx, jobName)
  686. } else if taskType == models.TypeC2Net {
  687. ComputeResource := ctx.Query("compute_resource")
  688. if ComputeResource == models.NPUResource {
  689. createForGrampusNPU(ctx, jobName)
  690. } else if ComputeResource == models.GPUResource {
  691. createForGrampusGPU(ctx, jobName)
  692. }
  693. }
  694. log.Info("to redirect...")
  695. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  696. }
  697. func createForGrampusGPU(ctx *context.Context, jobName string) {
  698. BootFile := ctx.Query("boot_file")
  699. displayJobName := ctx.Query("display_job_name")
  700. description := ctx.Query("description")
  701. image := strings.TrimSpace(ctx.Query("image"))
  702. srcDataset := ctx.Query("src_dataset") //uuid
  703. combatDataset := ctx.Query("combat_dataset") //uuid
  704. evaluationIndex := ctx.Query("evaluation_index")
  705. Params := ctx.Query("run_para_list")
  706. specId := ctx.QueryInt64("spec_id")
  707. TrainUrl := ctx.Query("train_url")
  708. CkptName := ctx.Query("ckpt_name")
  709. ModelName := ctx.Query("model_name")
  710. ModelVersion := ctx.Query("model_version")
  711. repo := ctx.Repo.Repository
  712. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  713. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  714. //check specification
  715. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  716. JobType: models.JobTypeTrain,
  717. ComputeResource: models.GPU,
  718. Cluster: models.C2NetCluster,
  719. })
  720. if err != nil || spec == nil {
  721. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  722. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  723. return
  724. }
  725. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  726. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  727. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  728. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  729. return
  730. }
  731. //check dataset
  732. uuid := srcDataset + ";" + combatDataset
  733. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  734. if err != nil {
  735. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  736. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  737. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  738. return
  739. }
  740. //prepare code and out path
  741. _, err = ioutil.ReadDir(codeLocalPath)
  742. if err == nil {
  743. os.RemoveAll(codeLocalPath)
  744. }
  745. if err := downloadZipCode(ctx, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  746. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  747. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  748. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  749. return
  750. }
  751. //todo: upload code (send to file_server todo this work?)
  752. //upload code
  753. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  754. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  755. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  756. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  757. return
  758. }
  759. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  760. if err := mkModelPath(modelPath); err != nil {
  761. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  762. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  763. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  764. return
  765. }
  766. //init model readme
  767. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  768. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  769. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  770. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  771. return
  772. }
  773. var datasetRemotePath, allFileName string
  774. for _, datasetInfo := range datasetInfos {
  775. if datasetRemotePath == "" {
  776. datasetRemotePath = datasetInfo.DataLocalPath
  777. allFileName = datasetInfo.FullName
  778. } else {
  779. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  780. allFileName = allFileName + ";" + datasetInfo.FullName
  781. }
  782. }
  783. //prepare command
  784. preTrainModelPath := getPreTrainModelPath(TrainUrl, CkptName)
  785. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, BootFile, Params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, CkptName)
  786. if err != nil {
  787. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  788. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  789. ctx.RenderWithErr("Create task failed, internal error", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  790. return
  791. }
  792. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  793. req := &grampus.GenerateTrainJobReq{
  794. JobName: jobName,
  795. DisplayJobName: displayJobName,
  796. ComputeResource: models.GPUResource,
  797. ProcessType: grampus.ProcessorTypeGPU,
  798. Command: command,
  799. ImageUrl: image,
  800. Description: description,
  801. BootFile: BootFile,
  802. Uuid: uuid,
  803. CommitID: commitID,
  804. BranchName: cloudbrain.DefaultBranchName,
  805. Params: Params,
  806. EngineName: image,
  807. DatasetNames: datasetNames,
  808. DatasetInfos: datasetInfos,
  809. IsLatestVersion: modelarts.IsLatestVersion,
  810. VersionCount: modelarts.VersionCountOne,
  811. WorkServerNumber: 1,
  812. Spec: spec,
  813. ModelName: ModelName,
  814. LabelName: evaluationIndex,
  815. CkptName: CkptName,
  816. ModelVersion: ModelVersion,
  817. PreTrainModelUrl: TrainUrl,
  818. }
  819. err = grampus.GenerateTrainJob(ctx, req)
  820. if err != nil {
  821. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  822. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  823. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  824. return
  825. }
  826. }
  827. func createForGrampusNPU(ctx *context.Context, jobName string) {
  828. }
  829. func createForNPU(ctx *context.Context, jobName string) {
  830. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  831. BootFile := ctx.Query("boot_file")
  832. displayJobName := ctx.Query("display_job_name")
  833. description := ctx.Query("description")
  834. srcDataset := ctx.Query("src_dataset") //uuid
  835. combatDataset := ctx.Query("combat_dataset") //uuid
  836. evaluationIndex := ctx.Query("evaluation_index")
  837. Params := ctx.Query("run_para_list")
  838. specId := ctx.QueryInt64("spec_id")
  839. engineID := ctx.QueryInt("engine_id")
  840. log.Info("engine_id=" + fmt.Sprint(engineID))
  841. poolID := ctx.Query("pool_id")
  842. repo := ctx.Repo.Repository
  843. trainUrl := ctx.Query("train_url")
  844. modelName := ctx.Query("model_name")
  845. modelVersion := ctx.Query("model_version")
  846. ckptName := ctx.Query("ckpt_name")
  847. ckptUrl := "/" + trainUrl + ckptName
  848. log.Info("ckpt url:" + ckptUrl)
  849. FlavorName := ctx.Query("flaver_names")
  850. EngineName := ctx.Query("engine_names")
  851. isLatestVersion := modelarts.IsLatestVersion
  852. VersionCount := modelarts.VersionCountOne
  853. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  854. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  855. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  856. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  857. log.Info("ckpt url:" + ckptUrl)
  858. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  859. JobType: models.JobTypeInference,
  860. ComputeResource: models.NPU,
  861. Cluster: models.OpenICluster,
  862. AiCenterCode: models.AICenterOfCloudBrainTwo})
  863. if err != nil || spec == nil {
  864. modelSafetyNewDataPrepare(ctx)
  865. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  866. return
  867. }
  868. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  869. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  870. modelSafetyNewDataPrepare(ctx)
  871. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  872. return
  873. }
  874. //todo: del the codeLocalPath
  875. _, err = ioutil.ReadDir(codeLocalPath)
  876. if err == nil {
  877. os.RemoveAll(codeLocalPath)
  878. }
  879. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  880. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  881. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  882. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  883. modelSafetyNewDataPrepare(ctx)
  884. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  885. return
  886. }
  887. //todo: upload code (send to file_server todo this work?)
  888. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  889. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  890. modelSafetyNewDataPrepare(ctx)
  891. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  892. return
  893. }
  894. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  895. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  896. modelSafetyNewDataPrepare(ctx)
  897. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  898. return
  899. }
  900. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  901. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  902. modelSafetyNewDataPrepare(ctx)
  903. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  904. return
  905. }
  906. var parameters models.Parameters
  907. param := make([]models.Parameter, 0)
  908. param = append(param, models.Parameter{
  909. Label: modelarts.ResultUrl,
  910. Value: "s3:/" + resultObsPath,
  911. }, models.Parameter{
  912. Label: modelarts.CkptUrl,
  913. Value: "s3:/" + ckptUrl,
  914. })
  915. uuid := srcDataset + ";" + combatDataset
  916. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  917. if err != nil {
  918. modelSafetyNewDataPrepare(ctx)
  919. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  920. return
  921. }
  922. dataPath := dataUrl
  923. jsondatas, err := json.Marshal(datasUrlList)
  924. if err != nil {
  925. log.Error("Failed to Marshal: %v", err)
  926. modelSafetyNewDataPrepare(ctx)
  927. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  928. return
  929. }
  930. if isMultiDataset {
  931. param = append(param, models.Parameter{
  932. Label: modelarts.MultiDataUrl,
  933. Value: string(jsondatas),
  934. })
  935. }
  936. existDeviceTarget := false
  937. if len(Params) != 0 {
  938. err := json.Unmarshal([]byte(Params), &parameters)
  939. if err != nil {
  940. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  941. modelSafetyNewDataPrepare(ctx)
  942. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  943. return
  944. }
  945. for _, parameter := range parameters.Parameter {
  946. if parameter.Label == modelarts.DeviceTarget {
  947. existDeviceTarget = true
  948. }
  949. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  950. param = append(param, models.Parameter{
  951. Label: parameter.Label,
  952. Value: parameter.Value,
  953. })
  954. }
  955. }
  956. }
  957. if !existDeviceTarget {
  958. param = append(param, models.Parameter{
  959. Label: modelarts.DeviceTarget,
  960. Value: modelarts.Ascend,
  961. })
  962. }
  963. req := &modelarts.GenerateInferenceJobReq{
  964. JobName: jobName,
  965. DisplayJobName: displayJobName,
  966. DataUrl: dataPath,
  967. Description: description,
  968. CodeObsPath: codeObsPath,
  969. BootFileUrl: codeObsPath + BootFile,
  970. BootFile: BootFile,
  971. TrainUrl: trainUrl,
  972. WorkServerNumber: 1,
  973. EngineID: int64(engineID),
  974. LogUrl: logObsPath,
  975. PoolID: poolID,
  976. Uuid: uuid,
  977. Parameters: param, //modelarts train parameters
  978. CommitID: commitID,
  979. BranchName: cloudbrain.DefaultBranchName,
  980. Params: Params,
  981. FlavorName: FlavorName,
  982. EngineName: EngineName,
  983. LabelName: evaluationIndex,
  984. IsLatestVersion: isLatestVersion,
  985. VersionCount: VersionCount,
  986. TotalVersionCount: modelarts.TotalVersionCount,
  987. ModelName: modelName,
  988. ModelVersion: modelVersion,
  989. CkptName: ckptName,
  990. ResultUrl: resultObsPath,
  991. Spec: spec,
  992. DatasetName: datasetNames,
  993. JobType: string(models.JobTypeModelSafety),
  994. }
  995. err = modelarts.GenerateInferenceJob(ctx, req)
  996. if err != nil {
  997. log.Error("GenerateTrainJob failed:%v", err.Error())
  998. modelSafetyNewDataPrepare(ctx)
  999. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  1000. return
  1001. }
  1002. }
  1003. func createForGPU(ctx *context.Context, jobName string) {
  1004. BootFile := ctx.Query("boot_file")
  1005. displayJobName := ctx.Query("display_job_name")
  1006. description := ctx.Query("description")
  1007. image := strings.TrimSpace(ctx.Query("image"))
  1008. srcDataset := ctx.Query("src_dataset") //uuid
  1009. combatDataset := ctx.Query("combat_dataset") //uuid
  1010. evaluationIndex := ctx.Query("evaluation_index")
  1011. Params := ctx.Query("run_para_list")
  1012. specId := ctx.QueryInt64("spec_id")
  1013. TrainUrl := ctx.Query("train_url")
  1014. CkptName := ctx.Query("ckpt_name")
  1015. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  1016. log.Info("ckpt url:" + ckptUrl)
  1017. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  1018. JobType: models.JobTypeBenchmark,
  1019. ComputeResource: models.GPU,
  1020. Cluster: models.OpenICluster,
  1021. AiCenterCode: models.AICenterOfCloudBrainOne})
  1022. if err != nil || spec == nil {
  1023. modelSafetyNewDataPrepare(ctx)
  1024. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  1025. return
  1026. }
  1027. repo := ctx.Repo.Repository
  1028. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  1029. os.RemoveAll(codePath)
  1030. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  1031. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  1032. modelSafetyNewDataPrepare(ctx)
  1033. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  1034. return
  1035. }
  1036. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  1037. if err != nil {
  1038. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  1039. modelSafetyNewDataPrepare(ctx)
  1040. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  1041. return
  1042. }
  1043. uuid := srcDataset + ";" + combatDataset
  1044. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  1045. log.Info("uuid=" + uuid)
  1046. if err != nil {
  1047. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  1048. modelSafetyNewDataPrepare(ctx)
  1049. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  1050. return
  1051. }
  1052. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  1053. if err != nil {
  1054. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  1055. modelSafetyNewDataPrepare(ctx)
  1056. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  1057. return
  1058. }
  1059. log.Info("Command=" + command)
  1060. req := cloudbrain.GenerateCloudBrainTaskReq{
  1061. Ctx: ctx,
  1062. DisplayJobName: displayJobName,
  1063. JobName: jobName,
  1064. Image: image,
  1065. Command: command,
  1066. Uuids: uuid,
  1067. DatasetNames: datasetNames,
  1068. DatasetInfos: datasetInfos,
  1069. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  1070. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  1071. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  1072. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  1073. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  1074. JobType: string(models.JobTypeModelSafety),
  1075. Description: description,
  1076. BranchName: cloudbrain.DefaultBranchName,
  1077. BootFile: BootFile,
  1078. Params: Params,
  1079. CommitID: "",
  1080. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  1081. Spec: spec,
  1082. LabelName: evaluationIndex,
  1083. }
  1084. err = cloudbrain.GenerateTask(req)
  1085. if err != nil {
  1086. modelSafetyNewDataPrepare(ctx)
  1087. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  1088. return
  1089. }
  1090. //ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  1091. }
  1092. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  1093. var command string
  1094. bootFile := strings.TrimSpace(BootFile)
  1095. if !strings.HasSuffix(bootFile, ".py") {
  1096. log.Error("bootFile(%s) format error", bootFile)
  1097. return command, errors.New("bootFile format error")
  1098. }
  1099. var parameters models.Parameters
  1100. var param string
  1101. if len(params) != 0 {
  1102. err := json.Unmarshal([]byte(params), &parameters)
  1103. if err != nil {
  1104. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1105. return command, err
  1106. }
  1107. for _, parameter := range parameters.Parameter {
  1108. param += " --" + parameter.Label + "=" + parameter.Value
  1109. }
  1110. }
  1111. param += " --modelname" + "=" + CkptName
  1112. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  1113. return command, nil
  1114. }
  1115. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  1116. ctx.Data["PageIsCloudBrain"] = true
  1117. ctx.Data["boot_file"] = ctx.Query("boot_file")
  1118. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  1119. ctx.Data["description"] = ctx.Query("description")
  1120. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  1121. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  1122. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  1123. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  1124. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  1125. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  1126. ctx.Data["train_url"] = ctx.Query("train_url")
  1127. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  1128. ctx.Data["train_url"] = ctx.Query("train_url")
  1129. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  1130. ctx.Data["model_name"] = ctx.Query("model_name")
  1131. ctx.Data["model_version"] = ctx.Query("model_version")
  1132. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  1133. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  1134. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  1135. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  1136. prepareCloudbrainOneSpecs(ctx)
  1137. prepareCloudbrainTwoInferenceSpecs(ctx)
  1138. return nil
  1139. }
  1140. func getJsonContent(url string) (string, error) {
  1141. resp, err := http.Get(url)
  1142. if err != nil || resp.StatusCode != 200 {
  1143. log.Info("Get organizations url error=" + err.Error())
  1144. return "", err
  1145. }
  1146. bytes, err := ioutil.ReadAll(resp.Body)
  1147. resp.Body.Close()
  1148. if err != nil {
  1149. log.Info("Get organizations url error=" + err.Error())
  1150. return "", err
  1151. }
  1152. str := string(bytes)
  1153. //log.Info("json str =" + str)
  1154. return str, nil
  1155. }