You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aisafety.go 34 kB


  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/aisafety"
  16. "code.gitea.io/gitea/modules/cloudbrain"
  17. "code.gitea.io/gitea/modules/context"
  18. "code.gitea.io/gitea/modules/git"
  19. "code.gitea.io/gitea/modules/log"
  20. "code.gitea.io/gitea/modules/modelarts"
  21. "code.gitea.io/gitea/modules/setting"
  22. "code.gitea.io/gitea/modules/storage"
  23. "code.gitea.io/gitea/modules/timeutil"
  24. "code.gitea.io/gitea/modules/util"
  25. "code.gitea.io/gitea/services/cloudbrain/resource"
  26. "code.gitea.io/gitea/services/reward/point/account"
  27. uuid "github.com/satori/go.uuid"
  28. )
// Template paths for the model safety test pages.
const (
	// Templates whose names refer to the Grampus GPU/NPU "new task" pages.
	tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
	tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
	// Rendered by AiSafetyCreateForGetGPU.
	tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"
	// Rendered by AiSafetyCreateForGetNPU.
	tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"
	// Rendered by GetAiSafetyTaskTmpl to show a single task.
	tplModelSafetyTestShow = "repo/modelsafety/show"
)
  36. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  37. log.Info("start to create CloudBrainAiSafetyCreate")
  38. uuid := uuid.NewV4()
  39. id := uuid.String()
  40. seriaNoParas := ctx.Query("serialNo")
  41. fileName := ctx.Query("fileName")
  42. //if jobType == string(models.JobTypeBenchmark) {
  43. req := aisafety.TaskReq{
  44. UnionId: id,
  45. EvalName: "test1",
  46. EvalContent: "test1",
  47. TLPath: "test1",
  48. Indicators: []string{"ACC", "ASS"},
  49. CDName: "CIFAR10_1000_FGSM",
  50. BDName: "CIFAR10_1000基础数据集",
  51. }
  52. aisafety.GetAlgorithmList()
  53. if seriaNoParas != "" {
  54. aisafety.GetTaskStatus(seriaNoParas)
  55. } else {
  56. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  57. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  58. if err == nil {
  59. log.Info("serialNo=" + serialNo)
  60. time.Sleep(time.Duration(2) * time.Second)
  61. aisafety.GetTaskStatus(serialNo)
  62. } else {
  63. log.Info("CreateSafetyTask error," + err.Error())
  64. }
  65. }
  66. }
  67. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  68. if job == nil {
  69. log.Error("GetCloudbrainByJobID failed")
  70. return
  71. }
  72. syncAiSafetyTaskStatus(job)
  73. }
  74. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  75. ctx.Data["id"] = ctx.Params(":jobid")
  76. ctx.Data["PageIsCloudBrain"] = true
  77. ctx.HTML(200, tplModelSafetyTestShow)
  78. }
  79. func GetAiSafetyTask(ctx *context.Context) {
  80. var ID = ctx.Params(":jobid")
  81. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  82. if err != nil {
  83. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  84. return
  85. }
  86. syncAiSafetyTaskStatus(job)
  87. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  88. job.BenchmarkType = "安全评测"
  89. job.BenchmarkTypeName = "Image Classification"
  90. job.CanModify = cloudbrain.CanModifyJob(ctx, job)
  91. job.CanDel = cloudbrain.CanDeleteJob(ctx, job)
  92. s, err := resource.GetCloudbrainSpec(job.ID)
  93. if err == nil {
  94. job.Spec = s
  95. }
  96. user, err := models.GetUserByID(job.UserID)
  97. if err == nil {
  98. tmpUser := &models.User{
  99. Name: user.Name,
  100. }
  101. job.User = tmpUser
  102. }
  103. ctx.JSON(200, job)
  104. }
  105. func StopAiSafetyTask(ctx *context.Context) {
  106. log.Info("start to stop the task.")
  107. var ID = ctx.Params(":jobid")
  108. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  109. result := make(map[string]interface{})
  110. result["code"] = -1
  111. if err != nil {
  112. log.Info("query task error.err=" + err.Error())
  113. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  114. result["msg"] = "No such task."
  115. ctx.JSON(200, result)
  116. return
  117. }
  118. if isTaskNotFinished(task.Status) {
  119. if task.Type == models.TypeCloudBrainTwo {
  120. log.Info("start to stop model arts task.")
  121. _, err := modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  122. if err != nil {
  123. log.Info("stop failed.err=" + err.Error())
  124. }
  125. task.Status = string(models.JobStopped)
  126. if task.EndTime == 0 {
  127. task.EndTime = timeutil.TimeStampNow()
  128. }
  129. task.ComputeAndSetDuration()
  130. err = models.UpdateJob(task)
  131. if err != nil {
  132. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  133. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  134. ctx.JSON(200, result)
  135. return
  136. }
  137. //queryTaskStatusFromCloudbrainTwo(job)
  138. } else if task.Type == models.TypeCloudBrainOne {
  139. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  140. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  141. result["msg"] = "cloudbrain.Already_stopped"
  142. ctx.JSON(200, result)
  143. return
  144. }
  145. err := cloudbrain.StopJob(task.JobID)
  146. if err != nil {
  147. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  148. result["msg"] = "cloudbrain.Stopped_failed"
  149. ctx.JSON(200, result)
  150. return
  151. }
  152. task.Status = string(models.JobStopped)
  153. if task.EndTime == 0 {
  154. task.EndTime = timeutil.TimeStampNow()
  155. }
  156. task.ComputeAndSetDuration()
  157. err = models.UpdateJob(task)
  158. if err != nil {
  159. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  160. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  161. ctx.JSON(200, result)
  162. return
  163. }
  164. }
  165. } else {
  166. if task.Status == string(models.ModelSafetyTesting) {
  167. //修改为Failed
  168. task.Status = string(models.JobStopped)
  169. if task.EndTime == 0 {
  170. task.EndTime = timeutil.TimeStampNow()
  171. }
  172. task.ComputeAndSetDuration()
  173. err = models.UpdateJob(task)
  174. if err != nil {
  175. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  176. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  177. ctx.JSON(200, result)
  178. return
  179. }
  180. } else {
  181. log.Info("The job is finished. status=" + task.Status)
  182. }
  183. }
  184. }
  185. func DelAiSafetyTask(ctx *context.Context) {
  186. var ID = ctx.Params(":jobid")
  187. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  188. result := make(map[string]interface{})
  189. result["code"] = 1
  190. if err != nil {
  191. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  192. result["msg"] = "No such task."
  193. ctx.ServerError("No such task.", err)
  194. return
  195. }
  196. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  197. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  198. result["msg"] = "the job(" + task.JobName + ") has not been stopped"
  199. ctx.ServerError("the job("+task.JobName+") has not been stopped", nil)
  200. return
  201. }
  202. if task.Type == models.TypeCloudBrainOne {
  203. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  204. }
  205. err = models.DeleteJob(task)
  206. if err != nil {
  207. result["msg"] = err.Error()
  208. ctx.ServerError(err.Error(), err)
  209. return
  210. }
  211. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  212. }
  213. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  214. log.Info("start to query safety task status.")
  215. if isTaskNotFinished(job.Status) {
  216. if job.Type == models.TypeCloudBrainTwo {
  217. queryTaskStatusFromCloudbrainTwo(job)
  218. } else if job.Type == models.TypeCloudBrainOne {
  219. queryTaskStatusFromCloudbrain(job)
  220. }
  221. } else {
  222. if job.Status == string(models.ModelSafetyTesting) {
  223. queryTaskStatusFromModelSafetyTestServer(job)
  224. } else {
  225. log.Info("The job is finished. status=" + job.Status)
  226. }
  227. }
  228. }
  229. func TimerHandleModelSafetyTestTask() {
  230. log.Info("start to TimerHandleModelSafetyTestTask")
  231. tasks, err := models.GetModelSafetyTestTask()
  232. if err == nil {
  233. if tasks != nil && len(tasks) > 0 {
  234. for _, job := range tasks {
  235. syncAiSafetyTaskStatus(job)
  236. }
  237. } else {
  238. log.Info("query running model safety test task 0.")
  239. }
  240. } else {
  241. log.Info("query running model safety test task err." + err.Error())
  242. }
  243. }
  244. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  245. log.Info("The task not finished,name=" + job.DisplayJobName)
  246. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  247. if err != nil {
  248. log.Info("query train job error." + err.Error())
  249. return
  250. }
  251. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  252. job.Duration = result.Duration / 1000
  253. job.TrainJobDuration = result.TrainJobDuration
  254. if job.StartTime == 0 && result.StartTime > 0 {
  255. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  256. }
  257. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  258. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  259. job.EndTime = job.StartTime.Add(job.Duration)
  260. }
  261. job.CorrectCreateUnix()
  262. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  263. log.Info("CloudbrainTwo task status=" + job.Status)
  264. err = models.UpdateJob(job)
  265. if err != nil {
  266. log.Error("UpdateJob failed:", err)
  267. }
  268. } else {
  269. log.Info("start to deal ModelSafetyTesting, task status=" + job.Status)
  270. job.Status = string(models.ModelSafetyTesting)
  271. err = models.UpdateJob(job)
  272. if err != nil {
  273. log.Error("UpdateJob failed:", err)
  274. }
  275. //send msg to beihang
  276. sendNPUInferenceResultToTest(job)
  277. }
  278. }
  279. func updateCloudBrainOneJobTime(task *models.Cloudbrain) {
  280. if task.TrainJobDuration == "" {
  281. if task.Duration == 0 {
  282. var duration int64
  283. if task.Status == string(models.JobWaiting) {
  284. duration = 0
  285. } else if task.Status == string(models.JobRunning) {
  286. duration = time.Now().Unix() - int64(task.CreatedUnix)
  287. } else {
  288. duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
  289. }
  290. task.Duration = duration
  291. }
  292. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  293. }
  294. }
  295. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  296. log.Info("The task not finished,name=" + job.DisplayJobName)
  297. jobResult, err := cloudbrain.GetJob(job.JobID)
  298. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  299. if err != nil {
  300. log.Error("ConvertToJobResultPayload failed:", err)
  301. return
  302. }
  303. job.Status = result.JobStatus.State
  304. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  305. taskRoles := result.TaskRoles
  306. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  307. job.Status = taskRes.TaskStatuses[0].State
  308. }
  309. updateCloudBrainOneJobTime(job)
  310. log.Info("cloud brain one job status=" + job.Status)
  311. if result.JobStatus.State != string(models.JobSucceeded) {
  312. err = models.UpdateJob(job)
  313. if err != nil {
  314. log.Error("UpdateJob failed:", err)
  315. }
  316. } else {
  317. //
  318. job.Status = string(models.ModelSafetyTesting)
  319. err = models.UpdateJob(job)
  320. if err != nil {
  321. log.Error("UpdateJob failed:", err)
  322. }
  323. //send msg to beihang
  324. sendGPUInferenceResultToTest(job)
  325. }
  326. }
  327. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  328. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  329. if err == nil {
  330. if result.Code == "0" {
  331. if result.Data.Status == 1 {
  332. log.Info("The task is running....")
  333. } else {
  334. if result.Data.Code == 0 {
  335. job.ResultJson = result.Data.StandardJson
  336. job.Status = string(models.JobSucceeded)
  337. err = models.UpdateJob(job)
  338. if err != nil {
  339. log.Error("UpdateJob failed:", err)
  340. }
  341. }
  342. }
  343. } else {
  344. log.Info("The task is failed.")
  345. job.Status = string(models.JobFailed)
  346. err = models.UpdateJob(job)
  347. if err != nil {
  348. log.Error("UpdateJob failed:", err)
  349. }
  350. }
  351. } else {
  352. log.Info("The task not found.....")
  353. }
  354. }
  355. func getAisafetyTaskReq(job *models.Cloudbrain) aisafety.TaskReq {
  356. datasetname := job.DatasetName
  357. datasetnames := strings.Split(datasetname, ";")
  358. indicator := job.LabelName
  359. EvalContent := "test1"
  360. if job.Description != "" {
  361. EvalContent = job.Description
  362. }
  363. req := aisafety.TaskReq{
  364. UnionId: job.JobID,
  365. EvalName: job.DisplayJobName,
  366. EvalContent: EvalContent,
  367. TLPath: "test1",
  368. Indicators: strings.Split(indicator, ";"),
  369. CDName: strings.Split(datasetnames[1], ".")[0],
  370. BDName: strings.Split(datasetnames[0], ".")[0] + "基础数据集",
  371. }
  372. log.Info("CDName=" + req.CDName)
  373. log.Info("BDName=" + req.BDName)
  374. return req
  375. }
  376. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  377. log.Info("send sendGPUInferenceResultToTest")
  378. req := getAisafetyTaskReq(job)
  379. resultDir := "/result"
  380. prefix := setting.CBCodePathPrefix + job.JobName + resultDir
  381. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  382. if err != nil {
  383. log.Error("query cloudbrain one model failed: %v", err)
  384. return
  385. }
  386. jsonContent := ""
  387. for _, file := range files {
  388. if strings.HasSuffix(file.FileName, "result.json") {
  389. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  390. log.Info("path=" + path)
  391. reader, err := os.Open(path)
  392. defer reader.Close()
  393. if err == nil {
  394. r := bufio.NewReader(reader)
  395. for {
  396. line, error := r.ReadString('\n')
  397. jsonContent += line
  398. if error == io.EOF {
  399. log.Info("read file completed.")
  400. break
  401. }
  402. if error != nil {
  403. log.Info("read file error." + error.Error())
  404. break
  405. }
  406. }
  407. }
  408. break
  409. }
  410. }
  411. if jsonContent != "" {
  412. sendHttpReqToBeihang(job, jsonContent, req)
  413. } else {
  414. updateJobFailed(job)
  415. }
  416. }
  417. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  418. log.Info("start to sendNPUInferenceResultToTest")
  419. req := getAisafetyTaskReq(job)
  420. jsonContent := ""
  421. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  422. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  423. resultPath = resultPath[1:]
  424. log.Info("bucket=" + setting.Bucket + " resultPath=" + resultPath)
  425. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  426. if err != nil {
  427. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  428. } else {
  429. defer body.Close()
  430. var data []byte
  431. p := make([]byte, 4096)
  432. var readErr error
  433. var readCount int
  434. for {
  435. readCount, readErr = body.Read(p)
  436. if readCount > 0 {
  437. data = append(data, p[:readCount]...)
  438. }
  439. if readErr != nil || readCount == 0 {
  440. break
  441. }
  442. }
  443. jsonContent = string(data)
  444. }
  445. if jsonContent != "" {
  446. sendHttpReqToBeihang(job, jsonContent, req)
  447. } else {
  448. updateJobFailed(job)
  449. }
  450. }
  451. func updateJobFailed(job *models.Cloudbrain) {
  452. log.Info("The json is null. so set it failed.")
  453. //update task failed.
  454. job.Status = string(models.ModelArtsTrainJobFailed)
  455. err := models.UpdateJob(job)
  456. if err != nil {
  457. log.Error("UpdateJob failed:", err)
  458. }
  459. }
  460. func sendHttpReqToBeihang(job *models.Cloudbrain, jsonContent string, req aisafety.TaskReq) {
  461. log.Info("start to send beihang ...")
  462. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  463. if err == nil {
  464. //update serial no to db
  465. job.PreVersionName = serialNo
  466. err = models.UpdateJob(job)
  467. if err != nil {
  468. log.Error("UpdateJob failed:", err)
  469. }
  470. }
  471. }
  472. func isTaskNotFinished(status string) bool {
  473. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  474. return true
  475. }
  476. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  477. return true
  478. }
  479. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  480. return true
  481. }
  482. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  483. return true
  484. }
  485. return false
  486. }
  487. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  488. t := time.Now()
  489. ctx.Data["PageIsCloudBrain"] = true
  490. ctx.Data["IsCreate"] = true
  491. ctx.Data["type"] = models.TypeCloudBrainOne
  492. ctx.Data["compute_resource"] = models.GPUResource
  493. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  494. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName
  495. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID
  496. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName
  497. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID
  498. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  499. ctx.Data["display_job_name"] = displayJobName
  500. prepareCloudbrainOneSpecs(ctx)
  501. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  502. if queuesDetail != nil {
  503. ctx.Data["QueuesDetail"] = queuesDetail
  504. }
  505. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  506. }
  507. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  508. t := time.Now()
  509. ctx.Data["PageIsCloudBrain"] = true
  510. ctx.Data["IsCreate"] = true
  511. ctx.Data["type"] = models.TypeCloudBrainTwo
  512. ctx.Data["compute_resource"] = models.NPUResource
  513. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  514. ctx.Data["display_job_name"] = displayJobName
  515. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  516. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName
  517. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID
  518. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName
  519. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID
  520. var resourcePools modelarts.ResourcePool
  521. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  522. ctx.ServerError("json.Unmarshal failed:", err)
  523. }
  524. ctx.Data["resource_pools"] = resourcePools.Info
  525. var engines modelarts.Engine
  526. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  527. ctx.ServerError("json.Unmarshal failed:", err)
  528. }
  529. ctx.Data["engines"] = engines.Info
  530. var versionInfos modelarts.VersionInfo
  531. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  532. ctx.ServerError("json.Unmarshal failed:", err)
  533. }
  534. ctx.Data["engine_versions"] = versionInfos.Version
  535. prepareCloudbrainTwoInferenceSpecs(ctx)
  536. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  537. ctx.Data["WaitCount"] = waitCount
  538. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  539. }
  540. func AiSafetyCreateForPost(ctx *context.Context) {
  541. ctx.Data["PageIsCloudBrain"] = true
  542. displayJobName := ctx.Query("display_job_name")
  543. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  544. taskType := ctx.QueryInt("type")
  545. description := ctx.Query("description")
  546. ctx.Data["type"] = taskType
  547. ctx.Data["displayJobName"] = displayJobName
  548. ctx.Data["description"] = description
  549. repo := ctx.Repo.Repository
  550. tpname := tplCloudBrainModelSafetyNewNpu
  551. if taskType == models.TypeCloudBrainOne {
  552. tpname = tplCloudBrainModelSafetyNewGpu
  553. }
  554. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  555. if err == nil {
  556. if len(tasks) != 0 {
  557. log.Error("the job name did already exist", ctx.Data["MsgID"])
  558. modelSafetyNewDataPrepare(ctx)
  559. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  560. return
  561. }
  562. } else {
  563. if !models.IsErrJobNotExist(err) {
  564. log.Error("system error, %v", err, ctx.Data["MsgID"])
  565. modelSafetyNewDataPrepare(ctx)
  566. ctx.RenderWithErr("system error", tpname, nil)
  567. return
  568. }
  569. }
  570. if !jobNamePattern.MatchString(jobName) {
  571. modelSafetyNewDataPrepare(ctx)
  572. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  573. return
  574. }
  575. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  576. if err != nil {
  577. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  578. modelSafetyNewDataPrepare(ctx)
  579. ctx.RenderWithErr("system error", tpname, nil)
  580. return
  581. } else {
  582. if count >= 1 {
  583. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  584. modelSafetyNewDataPrepare(ctx)
  585. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  586. return
  587. }
  588. }
  589. BootFile := ctx.Query("boot_file")
  590. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  591. if err != nil || !bootFileExist {
  592. log.Error("Get bootfile error:", err)
  593. modelSafetyNewDataPrepare(ctx)
  594. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  595. return
  596. }
  597. if taskType == models.TypeCloudBrainTwo {
  598. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  599. createForNPU(ctx, jobName)
  600. } else if taskType == models.TypeCloudBrainOne {
  601. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  602. createForGPU(ctx, jobName)
  603. }
  604. log.Info("to redirect...")
  605. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  606. }
  607. func createForNPU(ctx *context.Context, jobName string) {
  608. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  609. BootFile := ctx.Query("boot_file")
  610. displayJobName := ctx.Query("display_job_name")
  611. description := ctx.Query("description")
  612. srcDataset := ctx.Query("src_dataset") //uuid
  613. combatDataset := ctx.Query("combat_dataset") //uuid
  614. evaluationIndex := ctx.Query("evaluation_index")
  615. Params := ctx.Query("run_para_list")
  616. specId := ctx.QueryInt64("spec_id")
  617. engineID := ctx.QueryInt("engine_id")
  618. log.Info("engine_id=" + fmt.Sprint(engineID))
  619. poolID := ctx.Query("pool_id")
  620. repo := ctx.Repo.Repository
  621. trainUrl := ctx.Query("train_url")
  622. modelName := ctx.Query("model_name")
  623. modelVersion := ctx.Query("model_version")
  624. ckptName := ctx.Query("ckpt_name")
  625. ckptUrl := "/" + trainUrl + ckptName
  626. log.Info("ckpt url:" + ckptUrl)
  627. FlavorName := ctx.Query("flaver_names")
  628. EngineName := ctx.Query("engine_names")
  629. isLatestVersion := modelarts.IsLatestVersion
  630. VersionCount := modelarts.VersionCountOne
  631. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  632. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  633. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  634. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  635. log.Info("ckpt url:" + ckptUrl)
  636. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  637. JobType: models.JobTypeInference,
  638. ComputeResource: models.NPU,
  639. Cluster: models.OpenICluster,
  640. AiCenterCode: models.AICenterOfCloudBrainTwo})
  641. if err != nil || spec == nil {
  642. modelSafetyNewDataPrepare(ctx)
  643. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  644. return
  645. }
  646. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  647. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  648. modelSafetyNewDataPrepare(ctx)
  649. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  650. return
  651. }
  652. //todo: del the codeLocalPath
  653. _, err = ioutil.ReadDir(codeLocalPath)
  654. if err == nil {
  655. os.RemoveAll(codeLocalPath)
  656. }
  657. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  658. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  659. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  660. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  661. modelSafetyNewDataPrepare(ctx)
  662. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  663. return
  664. }
  665. //todo: upload code (send to file_server todo this work?)
  666. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  667. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  668. modelSafetyNewDataPrepare(ctx)
  669. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  670. return
  671. }
  672. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  673. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  674. modelSafetyNewDataPrepare(ctx)
  675. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  676. return
  677. }
  678. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  679. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  680. modelSafetyNewDataPrepare(ctx)
  681. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  682. return
  683. }
  684. var parameters models.Parameters
  685. param := make([]models.Parameter, 0)
  686. param = append(param, models.Parameter{
  687. Label: modelarts.ResultUrl,
  688. Value: "s3:/" + resultObsPath,
  689. }, models.Parameter{
  690. Label: modelarts.CkptUrl,
  691. Value: "s3:/" + ckptUrl,
  692. })
  693. uuid := srcDataset + ";" + combatDataset
  694. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  695. if err != nil {
  696. modelSafetyNewDataPrepare(ctx)
  697. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  698. return
  699. }
  700. dataPath := dataUrl
  701. jsondatas, err := json.Marshal(datasUrlList)
  702. if err != nil {
  703. log.Error("Failed to Marshal: %v", err)
  704. modelSafetyNewDataPrepare(ctx)
  705. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  706. return
  707. }
  708. if isMultiDataset {
  709. param = append(param, models.Parameter{
  710. Label: modelarts.MultiDataUrl,
  711. Value: string(jsondatas),
  712. })
  713. }
  714. existDeviceTarget := false
  715. if len(Params) != 0 {
  716. err := json.Unmarshal([]byte(Params), &parameters)
  717. if err != nil {
  718. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  719. modelSafetyNewDataPrepare(ctx)
  720. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  721. return
  722. }
  723. for _, parameter := range parameters.Parameter {
  724. if parameter.Label == modelarts.DeviceTarget {
  725. existDeviceTarget = true
  726. }
  727. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  728. param = append(param, models.Parameter{
  729. Label: parameter.Label,
  730. Value: parameter.Value,
  731. })
  732. }
  733. }
  734. }
  735. if !existDeviceTarget {
  736. param = append(param, models.Parameter{
  737. Label: modelarts.DeviceTarget,
  738. Value: modelarts.Ascend,
  739. })
  740. }
  741. req := &modelarts.GenerateInferenceJobReq{
  742. JobName: jobName,
  743. DisplayJobName: displayJobName,
  744. DataUrl: dataPath,
  745. Description: description,
  746. CodeObsPath: codeObsPath,
  747. BootFileUrl: codeObsPath + BootFile,
  748. BootFile: BootFile,
  749. TrainUrl: trainUrl,
  750. WorkServerNumber: 1,
  751. EngineID: int64(engineID),
  752. LogUrl: logObsPath,
  753. PoolID: poolID,
  754. Uuid: uuid,
  755. Parameters: param, //modelarts train parameters
  756. CommitID: commitID,
  757. BranchName: cloudbrain.DefaultBranchName,
  758. Params: Params,
  759. FlavorName: FlavorName,
  760. EngineName: EngineName,
  761. LabelName: evaluationIndex,
  762. IsLatestVersion: isLatestVersion,
  763. VersionCount: VersionCount,
  764. TotalVersionCount: modelarts.TotalVersionCount,
  765. ModelName: modelName,
  766. ModelVersion: modelVersion,
  767. CkptName: ckptName,
  768. ResultUrl: resultObsPath,
  769. Spec: spec,
  770. DatasetName: datasetNames,
  771. JobType: string(models.JobTypeModelSafety),
  772. }
  773. err = modelarts.GenerateInferenceJob(ctx, req)
  774. if err != nil {
  775. log.Error("GenerateTrainJob failed:%v", err.Error())
  776. modelSafetyNewDataPrepare(ctx)
  777. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  778. return
  779. }
  780. }
  781. func createForGPU(ctx *context.Context, jobName string) {
  782. BootFile := ctx.Query("boot_file")
  783. displayJobName := ctx.Query("display_job_name")
  784. description := ctx.Query("description")
  785. image := strings.TrimSpace(ctx.Query("image"))
  786. srcDataset := ctx.Query("src_dataset") //uuid
  787. combatDataset := ctx.Query("combat_dataset") //uuid
  788. evaluationIndex := ctx.Query("evaluation_index")
  789. Params := ctx.Query("run_para_list")
  790. specId := ctx.QueryInt64("spec_id")
  791. TrainUrl := ctx.Query("train_url")
  792. CkptName := ctx.Query("ckpt_name")
  793. modelName := ctx.Query("model_name")
  794. modelVersion := ctx.Query("model_version")
  795. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  796. log.Info("ckpt url:" + ckptUrl)
  797. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  798. JobType: models.JobTypeTrain,
  799. ComputeResource: models.GPU,
  800. Cluster: models.OpenICluster,
  801. AiCenterCode: models.AICenterOfCloudBrainOne})
  802. if err != nil || spec == nil {
  803. modelSafetyNewDataPrepare(ctx)
  804. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  805. return
  806. }
  807. repo := ctx.Repo.Repository
  808. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  809. os.RemoveAll(codePath)
  810. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  811. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  812. modelSafetyNewDataPrepare(ctx)
  813. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  814. return
  815. }
  816. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  817. if err != nil {
  818. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  819. modelSafetyNewDataPrepare(ctx)
  820. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  821. return
  822. }
  823. uuid := srcDataset + ";" + combatDataset
  824. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  825. log.Info("uuid=" + uuid)
  826. if err != nil {
  827. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  828. modelSafetyNewDataPrepare(ctx)
  829. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  830. return
  831. }
  832. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  833. if err != nil {
  834. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  835. modelSafetyNewDataPrepare(ctx)
  836. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  837. return
  838. }
  839. log.Info("Command=" + command)
  840. req := cloudbrain.GenerateCloudBrainTaskReq{
  841. Ctx: ctx,
  842. DisplayJobName: displayJobName,
  843. JobName: jobName,
  844. Image: image,
  845. Command: command,
  846. Uuids: uuid,
  847. DatasetNames: datasetNames,
  848. DatasetInfos: datasetInfos,
  849. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  850. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  851. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  852. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  853. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  854. JobType: string(models.JobTypeModelSafety),
  855. Description: description,
  856. BranchName: cloudbrain.DefaultBranchName,
  857. BootFile: BootFile,
  858. Params: Params,
  859. CommitID: "",
  860. ModelName: modelName,
  861. ModelVersion: modelVersion,
  862. CkptName: CkptName,
  863. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  864. Spec: spec,
  865. LabelName: evaluationIndex,
  866. }
  867. err = cloudbrain.GenerateTask(req)
  868. if err != nil {
  869. modelSafetyNewDataPrepare(ctx)
  870. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  871. return
  872. }
  873. //ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  874. }
  875. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  876. var command string
  877. bootFile := strings.TrimSpace(BootFile)
  878. if !strings.HasSuffix(bootFile, ".py") {
  879. log.Error("bootFile(%s) format error", bootFile)
  880. return command, errors.New("bootFile format error")
  881. }
  882. var parameters models.Parameters
  883. var param string
  884. if len(params) != 0 {
  885. err := json.Unmarshal([]byte(params), &parameters)
  886. if err != nil {
  887. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  888. return command, err
  889. }
  890. for _, parameter := range parameters.Parameter {
  891. param += " --" + parameter.Label + "=" + parameter.Value
  892. }
  893. }
  894. param += " --ckpt_url=" + cloudbrain.ModelMountPath + "/" + CkptName
  895. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  896. return command, nil
  897. }
  898. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  899. ctx.Data["PageIsCloudBrain"] = true
  900. ctx.Data["type"] = ctx.QueryInt("type")
  901. ctx.Data["boot_file"] = ctx.Query("boot_file")
  902. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  903. ctx.Data["description"] = ctx.Query("description")
  904. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  905. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  906. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  907. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  908. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  909. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  910. ctx.Data["train_url"] = ctx.Query("train_url")
  911. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  912. ctx.Data["train_url"] = ctx.Query("train_url")
  913. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  914. ctx.Data["model_name"] = ctx.Query("model_name")
  915. ctx.Data["model_version"] = ctx.Query("model_version")
  916. if ctx.QueryInt("type") == models.TypeCloudBrainOne {
  917. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName
  918. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID
  919. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName
  920. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID
  921. } else {
  922. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName
  923. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID
  924. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName
  925. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID
  926. }
  927. prepareCloudbrainOneSpecs(ctx)
  928. prepareCloudbrainTwoInferenceSpecs(ctx)
  929. return nil
  930. }
  931. func getJsonContent(url string) (string, error) {
  932. resp, err := http.Get(url)
  933. if err != nil || resp.StatusCode != 200 {
  934. log.Info("Get organizations url error=" + err.Error())
  935. return "", err
  936. }
  937. bytes, err := ioutil.ReadAll(resp.Body)
  938. resp.Body.Close()
  939. if err != nil {
  940. log.Info("Get organizations url error=" + err.Error())
  941. return "", err
  942. }
  943. str := string(bytes)
  944. //log.Info("json str =" + str)
  945. return str, nil
  946. }