You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

aisafety.go 34 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/aisafety"
  16. "code.gitea.io/gitea/modules/cloudbrain"
  17. "code.gitea.io/gitea/modules/context"
  18. "code.gitea.io/gitea/modules/git"
  19. "code.gitea.io/gitea/modules/log"
  20. "code.gitea.io/gitea/modules/modelarts"
  21. "code.gitea.io/gitea/modules/setting"
  22. "code.gitea.io/gitea/modules/storage"
  23. "code.gitea.io/gitea/modules/timeutil"
  24. "code.gitea.io/gitea/modules/util"
  25. "code.gitea.io/gitea/services/cloudbrain/resource"
  26. "code.gitea.io/gitea/services/reward/point/account"
  27. uuid "github.com/satori/go.uuid"
  28. )
  // Templates for the pages on which a model safety test task is created.
const (
	tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
	tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
	tplModelSafetyTestCreateGpu        = "repo/modelsafety/newgpu"
	tplModelSafetyTestCreateNpu        = "repo/modelsafety/newnpu"
)

// tplModelSafetyTestShow is the template for the page showing a single task.
const tplModelSafetyTestShow = "repo/modelsafety/show"
  36. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  37. log.Info("start to create CloudBrainAiSafetyCreate")
  38. uuid := uuid.NewV4()
  39. id := uuid.String()
  40. seriaNoParas := ctx.Query("serialNo")
  41. fileName := ctx.Query("fileName")
  42. //if jobType == string(models.JobTypeBenchmark) {
  43. req := aisafety.TaskReq{
  44. UnionId: id,
  45. EvalName: "test1",
  46. EvalContent: "test1",
  47. TLPath: "test1",
  48. Indicators: []string{"ACC", "ASS"},
  49. CDName: "CIFAR10_1000_FGSM",
  50. BDName: "CIFAR10_1000基础数据集",
  51. }
  52. aisafety.GetAlgorithmList()
  53. if seriaNoParas != "" {
  54. aisafety.GetTaskStatus(seriaNoParas)
  55. } else {
  56. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  57. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  58. if err == nil {
  59. log.Info("serialNo=" + serialNo)
  60. time.Sleep(time.Duration(2) * time.Second)
  61. aisafety.GetTaskStatus(serialNo)
  62. } else {
  63. log.Info("CreateSafetyTask error," + err.Error())
  64. }
  65. }
  66. }
  67. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  68. if job == nil {
  69. log.Error("GetCloudbrainByJobID failed")
  70. return
  71. }
  72. syncAiSafetyTaskStatus(job)
  73. }
  74. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  75. ctx.Data["id"] = ctx.Params(":jobid")
  76. ctx.Data["PageIsCloudBrain"] = true
  77. ctx.HTML(200, tplModelSafetyTestShow)
  78. }
  79. func GetAiSafetyTask(ctx *context.Context) {
  80. var ID = ctx.Params(":jobid")
  81. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  82. if err != nil {
  83. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  84. return
  85. }
  86. syncAiSafetyTaskStatus(job)
  87. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  88. job.BenchmarkType = "安全评测"
  89. job.BenchmarkTypeName = "Image Classification"
  90. job.CanModify = cloudbrain.CanModifyJob(ctx, job)
  91. job.CanDel = cloudbrain.CanDeleteJob(ctx, job)
  92. if job.Parameters == "{\"parameter\":[]}" {
  93. job.Parameters = ""
  94. }
  95. s, err := resource.GetCloudbrainSpec(job.ID)
  96. if err == nil {
  97. job.Spec = s
  98. }
  99. user, err := models.GetUserByID(job.UserID)
  100. if err == nil {
  101. tmpUser := &models.User{
  102. Name: user.Name,
  103. }
  104. job.User = tmpUser
  105. }
  106. ctx.JSON(200, job)
  107. }
  108. func StopAiSafetyTask(ctx *context.Context) {
  109. log.Info("start to stop the task.")
  110. var ID = ctx.Params(":jobid")
  111. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  112. result := make(map[string]interface{})
  113. result["result_code"] = "-1"
  114. if err != nil {
  115. log.Info("query task error.err=" + err.Error())
  116. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  117. result["msg"] = "No such task."
  118. ctx.JSON(200, result)
  119. return
  120. }
  121. if isTaskNotFinished(task.Status) {
  122. if task.Type == models.TypeCloudBrainTwo {
  123. log.Info("start to stop model arts task.")
  124. _, err := modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  125. if err != nil {
  126. log.Info("stop failed.err=" + err.Error())
  127. }
  128. task.Status = string(models.JobStopped)
  129. if task.EndTime == 0 {
  130. task.EndTime = timeutil.TimeStampNow()
  131. }
  132. task.ComputeAndSetDuration()
  133. err = models.UpdateJob(task)
  134. if err != nil {
  135. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  136. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  137. ctx.JSON(200, result)
  138. return
  139. }
  140. //queryTaskStatusFromCloudbrainTwo(job)
  141. } else if task.Type == models.TypeCloudBrainOne {
  142. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  143. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  144. result["msg"] = "cloudbrain.Already_stopped"
  145. ctx.JSON(200, result)
  146. return
  147. }
  148. err := cloudbrain.StopJob(task.JobID)
  149. if err != nil {
  150. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  151. result["msg"] = "cloudbrain.Stopped_failed"
  152. ctx.JSON(200, result)
  153. return
  154. }
  155. task.Status = string(models.JobStopped)
  156. if task.EndTime == 0 {
  157. task.EndTime = timeutil.TimeStampNow()
  158. }
  159. task.ComputeAndSetDuration()
  160. err = models.UpdateJob(task)
  161. if err != nil {
  162. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  163. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  164. ctx.JSON(200, result)
  165. return
  166. }
  167. }
  168. } else {
  169. if task.Status == string(models.ModelSafetyTesting) {
  170. //修改为Failed
  171. task.Status = string(models.JobStopped)
  172. if task.EndTime == 0 {
  173. task.EndTime = timeutil.TimeStampNow()
  174. }
  175. task.ComputeAndSetDuration()
  176. err = models.UpdateJob(task)
  177. if err != nil {
  178. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  179. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  180. ctx.JSON(200, result)
  181. return
  182. }
  183. } else {
  184. log.Info("The job is finished. status=" + task.Status)
  185. }
  186. }
  187. result["result_code"] = "0"
  188. result["msg"] = "succeed"
  189. ctx.JSON(200, result)
  190. }
  191. func DelAiSafetyTask(ctx *context.Context) {
  192. var ID = ctx.Params(":jobid")
  193. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  194. if err != nil {
  195. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  196. ctx.ServerError("No such task.", err)
  197. return
  198. }
  199. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  200. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  201. ctx.ServerError("the job("+task.JobName+") has not been stopped", nil)
  202. return
  203. }
  204. if task.Type == models.TypeCloudBrainOne {
  205. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  206. }
  207. err = models.DeleteJob(task)
  208. if err != nil {
  209. ctx.ServerError(err.Error(), err)
  210. return
  211. }
  212. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  213. }
  214. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  215. log.Info("start to query safety task status.")
  216. if isTaskNotFinished(job.Status) {
  217. if job.Type == models.TypeCloudBrainTwo {
  218. queryTaskStatusFromCloudbrainTwo(job)
  219. } else if job.Type == models.TypeCloudBrainOne {
  220. queryTaskStatusFromCloudbrain(job)
  221. }
  222. } else {
  223. if job.Status == string(models.ModelSafetyTesting) {
  224. queryTaskStatusFromModelSafetyTestServer(job)
  225. } else {
  226. log.Info("The job is finished. status=" + job.Status)
  227. }
  228. }
  229. }
  230. func TimerHandleModelSafetyTestTask() {
  231. log.Info("start to TimerHandleModelSafetyTestTask")
  232. tasks, err := models.GetModelSafetyTestTask()
  233. if err == nil {
  234. if tasks != nil && len(tasks) > 0 {
  235. for _, job := range tasks {
  236. syncAiSafetyTaskStatus(job)
  237. }
  238. } else {
  239. log.Info("query running model safety test task 0.")
  240. }
  241. } else {
  242. log.Info("query running model safety test task err." + err.Error())
  243. }
  244. }
  245. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  246. log.Info("The task not finished,name=" + job.DisplayJobName)
  247. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  248. if err != nil {
  249. log.Info("query train job error." + err.Error())
  250. return
  251. }
  252. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  253. job.Duration = result.Duration / 1000
  254. job.TrainJobDuration = result.TrainJobDuration
  255. if job.StartTime == 0 && result.StartTime > 0 {
  256. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  257. }
  258. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  259. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  260. job.EndTime = job.StartTime.Add(job.Duration)
  261. }
  262. job.CorrectCreateUnix()
  263. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  264. log.Info("CloudbrainTwo task status=" + job.Status)
  265. err = models.UpdateJob(job)
  266. if err != nil {
  267. log.Error("UpdateJob failed:", err)
  268. }
  269. } else {
  270. log.Info("start to deal ModelSafetyTesting, task status=" + job.Status)
  271. job.Status = string(models.ModelSafetyTesting)
  272. err = models.UpdateJob(job)
  273. if err != nil {
  274. log.Error("UpdateJob failed:", err)
  275. }
  276. //send msg to beihang
  277. sendNPUInferenceResultToTest(job)
  278. }
  279. }
  280. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  281. log.Info("The task not finished,name=" + job.DisplayJobName)
  282. jobResult, err := cloudbrain.GetJob(job.JobID)
  283. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  284. if err != nil {
  285. log.Error("ConvertToJobResultPayload failed:", err)
  286. return
  287. }
  288. job.Status = result.JobStatus.State
  289. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  290. taskRoles := result.TaskRoles
  291. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  292. job.Status = taskRes.TaskStatuses[0].State
  293. }
  294. models.ParseAndSetDurationFromCloudBrainOne(result, job)
  295. //updateCloudBrainOneJobTime(job)
  296. log.Info("cloud brain one job status=" + job.Status)
  297. if result.JobStatus.State != string(models.JobSucceeded) {
  298. err = models.UpdateJob(job)
  299. if err != nil {
  300. log.Error("UpdateJob failed:", err)
  301. }
  302. } else {
  303. //
  304. job.Status = string(models.ModelSafetyTesting)
  305. err = models.UpdateJob(job)
  306. if err != nil {
  307. log.Error("UpdateJob failed:", err)
  308. }
  309. //send msg to beihang
  310. sendGPUInferenceResultToTest(job)
  311. }
  312. }
  313. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  314. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  315. if err == nil {
  316. if result.Code == "0" {
  317. if result.Data.Status == 1 {
  318. log.Info("The task is running....")
  319. } else {
  320. if result.Data.Code == 0 {
  321. job.ResultJson = result.Data.StandardJson
  322. job.Status = string(models.JobSucceeded)
  323. err = models.UpdateJob(job)
  324. if err != nil {
  325. log.Error("UpdateJob failed:", err)
  326. }
  327. } else {
  328. job.ResultJson = result.Data.Msg
  329. job.Status = string(models.JobFailed)
  330. err = models.UpdateJob(job)
  331. if err != nil {
  332. log.Error("UpdateJob failed:", err)
  333. }
  334. }
  335. }
  336. } else {
  337. log.Info("The task is failed.")
  338. job.Status = string(models.JobFailed)
  339. err = models.UpdateJob(job)
  340. if err != nil {
  341. log.Error("UpdateJob failed:", err)
  342. }
  343. }
  344. } else {
  345. log.Info("The task not found.....")
  346. }
  347. }
  348. func getAisafetyTaskReq(job *models.Cloudbrain) aisafety.TaskReq {
  349. datasetname := job.DatasetName
  350. datasetnames := strings.Split(datasetname, ";")
  351. indicator := job.LabelName
  352. EvalContent := "test1"
  353. if job.Description != "" {
  354. EvalContent = job.Description
  355. }
  356. req := aisafety.TaskReq{
  357. UnionId: job.JobID,
  358. EvalName: job.DisplayJobName,
  359. EvalContent: EvalContent,
  360. TLPath: "test1",
  361. Indicators: strings.Split(indicator, ";"),
  362. CDName: strings.Split(datasetnames[1], ".")[0],
  363. BDName: strings.Split(datasetnames[0], ".")[0] + "基础数据集",
  364. }
  365. log.Info("CDName=" + req.CDName)
  366. log.Info("BDName=" + req.BDName)
  367. return req
  368. }
  369. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  370. log.Info("send sendGPUInferenceResultToTest")
  371. req := getAisafetyTaskReq(job)
  372. resultDir := "/result"
  373. prefix := setting.CBCodePathPrefix + job.JobName + resultDir
  374. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  375. if err != nil {
  376. log.Error("query cloudbrain one model failed: %v", err)
  377. return
  378. }
  379. jsonContent := ""
  380. for _, file := range files {
  381. if strings.HasSuffix(file.FileName, "result.json") {
  382. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  383. log.Info("path=" + path)
  384. reader, err := os.Open(path)
  385. defer reader.Close()
  386. if err == nil {
  387. r := bufio.NewReader(reader)
  388. for {
  389. line, error := r.ReadString('\n')
  390. jsonContent += line
  391. if error == io.EOF {
  392. log.Info("read file completed.")
  393. break
  394. }
  395. if error != nil {
  396. log.Info("read file error." + error.Error())
  397. break
  398. }
  399. }
  400. }
  401. break
  402. }
  403. }
  404. if jsonContent != "" {
  405. sendHttpReqToBeihang(job, jsonContent, req)
  406. } else {
  407. updateJobFailed(job, "推理生成的Json数据为空,无法进行评测。")
  408. }
  409. }
  410. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  411. log.Info("start to sendNPUInferenceResultToTest")
  412. req := getAisafetyTaskReq(job)
  413. jsonContent := ""
  414. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  415. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  416. resultPath = resultPath[1:]
  417. log.Info("bucket=" + setting.Bucket + " resultPath=" + resultPath)
  418. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  419. if err != nil {
  420. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  421. } else {
  422. defer body.Close()
  423. var data []byte
  424. p := make([]byte, 4096)
  425. var readErr error
  426. var readCount int
  427. for {
  428. readCount, readErr = body.Read(p)
  429. if readCount > 0 {
  430. data = append(data, p[:readCount]...)
  431. }
  432. if readErr != nil || readCount == 0 {
  433. break
  434. }
  435. }
  436. jsonContent = string(data)
  437. }
  438. if jsonContent != "" {
  439. sendHttpReqToBeihang(job, jsonContent, req)
  440. } else {
  441. updateJobFailed(job, "推理生成的Json数据为空,无法进行评测。")
  442. }
  443. }
  444. func updateJobFailed(job *models.Cloudbrain, msg string) {
  445. log.Info("The json is null. so set it failed.")
  446. //update task failed.
  447. job.Status = string(models.ModelArtsTrainJobFailed)
  448. job.ResultJson = msg
  449. err := models.UpdateJob(job)
  450. if err != nil {
  451. log.Error("UpdateJob failed:", err)
  452. }
  453. }
  454. func sendHttpReqToBeihang(job *models.Cloudbrain, jsonContent string, req aisafety.TaskReq) {
  455. log.Info("start to send beihang ...")
  456. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  457. if err == nil {
  458. //update serial no to db
  459. job.PreVersionName = serialNo
  460. err = models.UpdateJob(job)
  461. if err != nil {
  462. log.Error("UpdateJob failed:", err)
  463. }
  464. }
  465. }
  466. func isTaskNotFinished(status string) bool {
  467. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  468. return true
  469. }
  470. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  471. return true
  472. }
  473. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  474. return true
  475. }
  476. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  477. return true
  478. }
  479. return false
  480. }
  481. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  482. t := time.Now()
  483. ctx.Data["PageIsCloudBrain"] = true
  484. ctx.Data["IsCreate"] = true
  485. ctx.Data["type"] = models.TypeCloudBrainOne
  486. ctx.Data["compute_resource"] = models.GPUResource
  487. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  488. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName
  489. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID
  490. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName
  491. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID
  492. log.Info("GPUBaseDataSetName=" + setting.ModelSafetyTest.GPUBaseDataSetName)
  493. log.Info("GPUBaseDataSetUUID=" + setting.ModelSafetyTest.GPUBaseDataSetUUID)
  494. log.Info("GPUCombatDataSetName=" + setting.ModelSafetyTest.GPUCombatDataSetName)
  495. log.Info("GPUCombatDataSetUUID=" + setting.ModelSafetyTest.GPUCombatDataSetUUID)
  496. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  497. ctx.Data["display_job_name"] = displayJobName
  498. prepareCloudbrainOneSpecs(ctx)
  499. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  500. if queuesDetail != nil {
  501. ctx.Data["QueuesDetail"] = queuesDetail
  502. }
  503. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  504. }
  505. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  506. t := time.Now()
  507. ctx.Data["PageIsCloudBrain"] = true
  508. ctx.Data["IsCreate"] = true
  509. ctx.Data["type"] = models.TypeCloudBrainTwo
  510. ctx.Data["compute_resource"] = models.NPUResource
  511. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  512. ctx.Data["display_job_name"] = displayJobName
  513. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  514. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName
  515. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID
  516. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName
  517. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID
  518. log.Info("NPUBaseDataSetName=" + setting.ModelSafetyTest.NPUBaseDataSetName)
  519. log.Info("NPUBaseDataSetUUID=" + setting.ModelSafetyTest.NPUBaseDataSetUUID)
  520. log.Info("NPUCombatDataSetName=" + setting.ModelSafetyTest.NPUCombatDataSetName)
  521. log.Info("NPUCombatDataSetUUID=" + setting.ModelSafetyTest.NPUCombatDataSetUUID)
  522. var resourcePools modelarts.ResourcePool
  523. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  524. ctx.ServerError("json.Unmarshal failed:", err)
  525. }
  526. ctx.Data["resource_pools"] = resourcePools.Info
  527. var engines modelarts.Engine
  528. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  529. ctx.ServerError("json.Unmarshal failed:", err)
  530. }
  531. ctx.Data["engines"] = engines.Info
  532. var versionInfos modelarts.VersionInfo
  533. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  534. ctx.ServerError("json.Unmarshal failed:", err)
  535. }
  536. ctx.Data["engine_versions"] = versionInfos.Version
  537. prepareCloudbrainTwoInferenceSpecs(ctx)
  538. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  539. ctx.Data["WaitCount"] = waitCount
  540. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  541. }
  542. func AiSafetyCreateForPost(ctx *context.Context) {
  543. ctx.Data["PageIsCloudBrain"] = true
  544. displayJobName := ctx.Query("display_job_name")
  545. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  546. taskType := ctx.QueryInt("type")
  547. description := ctx.Query("description")
  548. ctx.Data["type"] = taskType
  549. ctx.Data["displayJobName"] = displayJobName
  550. ctx.Data["description"] = description
  551. repo := ctx.Repo.Repository
  552. tpname := tplCloudBrainModelSafetyNewNpu
  553. if taskType == models.TypeCloudBrainOne {
  554. tpname = tplCloudBrainModelSafetyNewGpu
  555. }
  556. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  557. if err == nil {
  558. if len(tasks) != 0 {
  559. log.Error("the job name did already exist", ctx.Data["MsgID"])
  560. modelSafetyNewDataPrepare(ctx)
  561. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  562. return
  563. }
  564. } else {
  565. if !models.IsErrJobNotExist(err) {
  566. log.Error("system error, %v", err, ctx.Data["MsgID"])
  567. modelSafetyNewDataPrepare(ctx)
  568. ctx.RenderWithErr("system error", tpname, nil)
  569. return
  570. }
  571. }
  572. if !jobNamePattern.MatchString(jobName) {
  573. modelSafetyNewDataPrepare(ctx)
  574. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  575. return
  576. }
  577. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  578. if err != nil {
  579. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  580. modelSafetyNewDataPrepare(ctx)
  581. ctx.RenderWithErr("system error", tpname, nil)
  582. return
  583. } else {
  584. if count >= 1 {
  585. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  586. modelSafetyNewDataPrepare(ctx)
  587. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  588. return
  589. }
  590. }
  591. BootFile := ctx.Query("boot_file")
  592. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  593. if err != nil || !bootFileExist {
  594. log.Error("Get bootfile error:", err)
  595. modelSafetyNewDataPrepare(ctx)
  596. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  597. return
  598. }
  599. if taskType == models.TypeCloudBrainTwo {
  600. err = createForNPU(ctx, jobName)
  601. } else if taskType == models.TypeCloudBrainOne {
  602. err = createForGPU(ctx, jobName)
  603. }
  604. if err != nil {
  605. modelSafetyNewDataPrepare(ctx)
  606. ctx.RenderWithErr(err.Error(), tpname, nil)
  607. } else {
  608. log.Info("to redirect...")
  609. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  610. }
  611. }
  612. func createForNPU(ctx *context.Context, jobName string) error {
  613. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  614. BootFile := ctx.Query("boot_file")
  615. displayJobName := ctx.Query("display_job_name")
  616. description := ctx.Query("description")
  617. srcDataset := ctx.Query("src_dataset") //uuid
  618. combatDataset := ctx.Query("combat_dataset") //uuid
  619. evaluationIndex := ctx.Query("evaluation_index")
  620. Params := ctx.Query("run_para_list")
  621. specId := ctx.QueryInt64("spec_id")
  622. engineID := ctx.QueryInt("engine_id")
  623. log.Info("engine_id=" + fmt.Sprint(engineID))
  624. poolID := ctx.Query("pool_id")
  625. repo := ctx.Repo.Repository
  626. trainUrl := ctx.Query("train_url")
  627. modelName := ctx.Query("model_name")
  628. modelVersion := ctx.Query("model_version")
  629. ckptName := ctx.Query("ckpt_name")
  630. ckptUrl := "/" + trainUrl + ckptName
  631. log.Info("ckpt url:" + ckptUrl)
  632. FlavorName := ctx.Query("flaver_names")
  633. EngineName := ctx.Query("engine_names")
  634. isLatestVersion := modelarts.IsLatestVersion
  635. VersionCount := modelarts.VersionCountOne
  636. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  637. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  638. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  639. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  640. log.Info("ckpt url:" + ckptUrl)
  641. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  642. JobType: models.JobTypeInference,
  643. ComputeResource: models.NPU,
  644. Cluster: models.OpenICluster,
  645. AiCenterCode: models.AICenterOfCloudBrainTwo})
  646. if err != nil || spec == nil {
  647. //ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  648. return errors.New("Resource specification not available")
  649. }
  650. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  651. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  652. return errors.New(ctx.Tr("points.insufficient_points_balance"))
  653. }
  654. //todo: del the codeLocalPath
  655. _, err = ioutil.ReadDir(codeLocalPath)
  656. if err == nil {
  657. os.RemoveAll(codeLocalPath)
  658. }
  659. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  660. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  661. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  662. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  663. return errors.New(ctx.Tr("cloudbrain.load_code_failed"))
  664. }
  665. //todo: upload code (send to file_server todo this work?)
  666. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  667. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  668. return errors.New("Failed to obsMkdir_result")
  669. }
  670. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  671. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  672. return errors.New("Failed to obsMkdir_log")
  673. }
  674. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  675. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  676. return errors.New(ctx.Tr("cloudbrain.load_code_failed"))
  677. }
  678. var parameters models.Parameters
  679. param := make([]models.Parameter, 0)
  680. param = append(param, models.Parameter{
  681. Label: modelarts.ResultUrl,
  682. Value: "s3:/" + resultObsPath,
  683. }, models.Parameter{
  684. Label: modelarts.CkptUrl,
  685. Value: "s3:/" + ckptUrl,
  686. })
  687. uuid := srcDataset + ";" + combatDataset
  688. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  689. if err != nil {
  690. return err
  691. }
  692. dataPath := dataUrl
  693. jsondatas, err := json.Marshal(datasUrlList)
  694. if err != nil {
  695. log.Error("Failed to Marshal: %v", err)
  696. return err
  697. }
  698. if isMultiDataset {
  699. param = append(param, models.Parameter{
  700. Label: modelarts.MultiDataUrl,
  701. Value: string(jsondatas),
  702. })
  703. }
  704. existDeviceTarget := false
  705. if len(Params) != 0 {
  706. err := json.Unmarshal([]byte(Params), &parameters)
  707. if err != nil {
  708. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  709. return errors.New("运行参数错误")
  710. }
  711. for _, parameter := range parameters.Parameter {
  712. if parameter.Label == modelarts.DeviceTarget {
  713. existDeviceTarget = true
  714. }
  715. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  716. param = append(param, models.Parameter{
  717. Label: parameter.Label,
  718. Value: parameter.Value,
  719. })
  720. }
  721. }
  722. }
  723. if !existDeviceTarget {
  724. param = append(param, models.Parameter{
  725. Label: modelarts.DeviceTarget,
  726. Value: modelarts.Ascend,
  727. })
  728. }
  729. req := &modelarts.GenerateInferenceJobReq{
  730. JobName: jobName,
  731. DisplayJobName: displayJobName,
  732. DataUrl: dataPath,
  733. Description: description,
  734. CodeObsPath: codeObsPath,
  735. BootFileUrl: codeObsPath + BootFile,
  736. BootFile: BootFile,
  737. TrainUrl: trainUrl,
  738. WorkServerNumber: 1,
  739. EngineID: int64(engineID),
  740. LogUrl: logObsPath,
  741. PoolID: poolID,
  742. Uuid: uuid,
  743. Parameters: param, //modelarts train parameters
  744. CommitID: commitID,
  745. BranchName: cloudbrain.DefaultBranchName,
  746. Params: Params,
  747. FlavorName: FlavorName,
  748. EngineName: EngineName,
  749. LabelName: evaluationIndex,
  750. IsLatestVersion: isLatestVersion,
  751. VersionCount: VersionCount,
  752. TotalVersionCount: modelarts.TotalVersionCount,
  753. ModelName: modelName,
  754. ModelVersion: modelVersion,
  755. CkptName: ckptName,
  756. ResultUrl: resultObsPath,
  757. Spec: spec,
  758. DatasetName: datasetNames,
  759. JobType: string(models.JobTypeModelSafety),
  760. }
  761. err = modelarts.GenerateInferenceJob(ctx, req)
  762. if err != nil {
  763. log.Error("GenerateTrainJob failed:%v", err.Error())
  764. return err
  765. }
  766. return nil
  767. }
  768. func createForGPU(ctx *context.Context, jobName string) error {
  769. BootFile := ctx.Query("boot_file")
  770. displayJobName := ctx.Query("display_job_name")
  771. description := ctx.Query("description")
  772. image := strings.TrimSpace(ctx.Query("image"))
  773. srcDataset := ctx.Query("src_dataset") //uuid
  774. combatDataset := ctx.Query("combat_dataset") //uuid
  775. evaluationIndex := ctx.Query("evaluation_index")
  776. Params := ctx.Query("run_para_list")
  777. specId := ctx.QueryInt64("spec_id")
  778. TrainUrl := ctx.Query("train_url")
  779. CkptName := ctx.Query("ckpt_name")
  780. modelName := ctx.Query("model_name")
  781. modelVersion := ctx.Query("model_version")
  782. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  783. log.Info("ckpt url:" + ckptUrl)
  784. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  785. JobType: models.JobTypeBenchmark,
  786. ComputeResource: models.GPU,
  787. Cluster: models.OpenICluster,
  788. AiCenterCode: models.AICenterOfCloudBrainOne})
  789. if err != nil || spec == nil {
  790. return errors.New("Resource specification not available")
  791. }
  792. repo := ctx.Repo.Repository
  793. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  794. os.RemoveAll(codePath)
  795. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  796. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  797. return errors.New("system error")
  798. }
  799. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  800. if err != nil {
  801. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  802. return errors.New("system error")
  803. }
  804. uuid := srcDataset + ";" + combatDataset
  805. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  806. log.Info("uuid=" + uuid)
  807. if err != nil {
  808. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  809. return errors.New(ctx.Tr("cloudbrain.error.dataset_select"))
  810. }
  811. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  812. if err != nil {
  813. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  814. return errors.New(ctx.Tr("cloudbrain.error.dataset_select"))
  815. }
  816. log.Info("Command=" + command)
  817. req := cloudbrain.GenerateCloudBrainTaskReq{
  818. Ctx: ctx,
  819. DisplayJobName: displayJobName,
  820. JobName: jobName,
  821. Image: image,
  822. Command: command,
  823. Uuids: uuid,
  824. DatasetNames: datasetNames,
  825. DatasetInfos: datasetInfos,
  826. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  827. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  828. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  829. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  830. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  831. JobType: string(models.JobTypeModelSafety),
  832. Description: description,
  833. BranchName: cloudbrain.DefaultBranchName,
  834. BootFile: BootFile,
  835. Params: Params,
  836. CommitID: "",
  837. ModelName: modelName,
  838. ModelVersion: modelVersion,
  839. CkptName: CkptName,
  840. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  841. Spec: spec,
  842. LabelName: evaluationIndex,
  843. }
  844. err = cloudbrain.GenerateTask(req)
  845. if err != nil {
  846. return err
  847. }
  848. return nil
  849. }
  850. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  851. var command string
  852. bootFile := strings.TrimSpace(BootFile)
  853. if !strings.HasSuffix(bootFile, ".py") {
  854. log.Error("bootFile(%s) format error", bootFile)
  855. return command, errors.New("bootFile format error")
  856. }
  857. var parameters models.Parameters
  858. var param string
  859. if len(params) != 0 {
  860. err := json.Unmarshal([]byte(params), &parameters)
  861. if err != nil {
  862. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  863. return command, err
  864. }
  865. for _, parameter := range parameters.Parameter {
  866. param += " --" + parameter.Label + "=" + parameter.Value
  867. }
  868. }
  869. param += " --ckpt_url=" + cloudbrain.ModelMountPath + "/" + CkptName
  870. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  871. return command, nil
  872. }
  873. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  874. ctx.Data["PageIsCloudBrain"] = true
  875. ctx.Data["type"] = ctx.QueryInt("type")
  876. ctx.Data["boot_file"] = ctx.Query("boot_file")
  877. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  878. ctx.Data["description"] = ctx.Query("description")
  879. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  880. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  881. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  882. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  883. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  884. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  885. ctx.Data["train_url"] = ctx.Query("train_url")
  886. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  887. ctx.Data["train_url"] = ctx.Query("train_url")
  888. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  889. ctx.Data["model_name"] = ctx.Query("model_name")
  890. ctx.Data["model_version"] = ctx.Query("model_version")
  891. if ctx.QueryInt("type") == models.TypeCloudBrainOne {
  892. ctx.Data["type"] = models.TypeCloudBrainOne
  893. ctx.Data["compute_resource"] = models.GPUResource
  894. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  895. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName
  896. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID
  897. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName
  898. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID
  899. prepareCloudbrainOneSpecs(ctx)
  900. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  901. if queuesDetail != nil {
  902. ctx.Data["QueuesDetail"] = queuesDetail
  903. }
  904. } else {
  905. ctx.Data["engine_id"] = ctx.QueryInt("engine_id")
  906. ctx.Data["pool_id"] = ctx.Query("pool_id")
  907. ctx.Data["type"] = models.TypeCloudBrainTwo
  908. ctx.Data["compute_resource"] = models.NPUResource
  909. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  910. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName
  911. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID
  912. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName
  913. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID
  914. var engines modelarts.Engine
  915. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  916. ctx.ServerError("json.Unmarshal failed:", err)
  917. }
  918. ctx.Data["engines"] = engines.Info
  919. var versionInfos modelarts.VersionInfo
  920. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  921. ctx.ServerError("json.Unmarshal failed:", err)
  922. }
  923. ctx.Data["engine_versions"] = versionInfos.Version
  924. prepareCloudbrainTwoInferenceSpecs(ctx)
  925. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  926. ctx.Data["WaitCount"] = waitCount
  927. }
  928. return nil
  929. }
  930. func getJsonContent(url string) (string, error) {
  931. resp, err := http.Get(url)
  932. if err != nil || resp.StatusCode != 200 {
  933. log.Info("Get organizations url error=" + err.Error())
  934. return "", err
  935. }
  936. bytes, err := ioutil.ReadAll(resp.Body)
  937. resp.Body.Close()
  938. if err != nil {
  939. log.Info("Get organizations url error=" + err.Error())
  940. return "", err
  941. }
  942. str := string(bytes)
  943. //log.Info("json str =" + str)
  944. return str, nil
  945. }