You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aisafety.go 33 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/aisafety"
  16. "code.gitea.io/gitea/modules/cloudbrain"
  17. "code.gitea.io/gitea/modules/context"
  18. "code.gitea.io/gitea/modules/git"
  19. "code.gitea.io/gitea/modules/log"
  20. "code.gitea.io/gitea/modules/modelarts"
  21. "code.gitea.io/gitea/modules/setting"
  22. "code.gitea.io/gitea/modules/storage"
  23. "code.gitea.io/gitea/modules/timeutil"
  24. "code.gitea.io/gitea/modules/util"
  25. "code.gitea.io/gitea/services/cloudbrain/resource"
  26. "code.gitea.io/gitea/services/reward/point/account"
  27. uuid "github.com/satori/go.uuid"
  28. )
  // Template names rendered by the model-safety-test handlers in this file.
  29. const (
  // Creation-form templates.
  // NOTE(review): the two Grampus templates are not referenced in the visible
  // portion of this file — confirm they are used elsewhere.
  30. tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
  31. tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
  // Rendered by AiSafetyCreateForGetGPU.
  32. tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"
  // Rendered by AiSafetyCreateForGetNPU.
  33. tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"
  // Rendered by GetAiSafetyTaskTmpl.
  34. tplModelSafetyTestShow = "repo/modelsafety/show"
  35. )
  36. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  37. log.Info("start to create CloudBrainAiSafetyCreate")
  38. uuid := uuid.NewV4()
  39. id := uuid.String()
  40. seriaNoParas := ctx.Query("serialNo")
  41. fileName := ctx.Query("fileName")
  42. //if jobType == string(models.JobTypeBenchmark) {
  43. req := aisafety.TaskReq{
  44. UnionId: id,
  45. EvalName: "test1",
  46. EvalContent: "test1",
  47. TLPath: "test1",
  48. Indicators: []string{"ACC", "ASS"},
  49. CDName: "CIFAR10_1000_FGSM",
  50. BDName: "CIFAR10_1000基础数据集",
  51. }
  52. aisafety.GetAlgorithmList()
  53. if seriaNoParas != "" {
  54. aisafety.GetTaskStatus(seriaNoParas)
  55. } else {
  56. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  57. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  58. if err == nil {
  59. log.Info("serialNo=" + serialNo)
  60. time.Sleep(time.Duration(2) * time.Second)
  61. aisafety.GetTaskStatus(serialNo)
  62. } else {
  63. log.Info("CreateSafetyTask error," + err.Error())
  64. }
  65. }
  66. }
  67. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  68. if job == nil {
  69. log.Error("GetCloudbrainByJobID failed")
  70. return
  71. }
  72. syncAiSafetyTaskStatus(job)
  73. }
  74. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  75. ctx.Data["id"] = ctx.Params(":jobid")
  76. ctx.Data["PageIsCloudBrain"] = true
  77. ctx.HTML(200, tplModelSafetyTestShow)
  78. }
  79. func GetAiSafetyTask(ctx *context.Context) {
  80. var ID = ctx.Params(":jobid")
  81. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  82. if err != nil {
  83. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  84. return
  85. }
  86. syncAiSafetyTaskStatus(job)
  87. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  88. job.BenchmarkType = "安全评测"
  89. job.BenchmarkTypeName = "Image Classification"
  90. job.CanModify = cloudbrain.CanModifyJob(ctx, job)
  91. job.CanDel = cloudbrain.CanDeleteJob(ctx, job)
  92. s, err := resource.GetCloudbrainSpec(job.ID)
  93. if err == nil {
  94. job.Spec = s
  95. }
  96. user, err := models.GetUserByID(job.UserID)
  97. if err == nil {
  98. tmpUser := &models.User{
  99. Name: user.Name,
  100. }
  101. job.User = tmpUser
  102. }
  103. ctx.JSON(200, job)
  104. }
  105. func StopAiSafetyTask(ctx *context.Context) {
  106. log.Info("start to stop the task.")
  107. var ID = ctx.Params(":jobid")
  108. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  109. result := make(map[string]interface{})
  110. result["result_code"] = "-1"
  111. if err != nil {
  112. log.Info("query task error.err=" + err.Error())
  113. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  114. result["msg"] = "No such task."
  115. ctx.JSON(200, result)
  116. return
  117. }
  118. if isTaskNotFinished(task.Status) {
  119. if task.Type == models.TypeCloudBrainTwo {
  120. log.Info("start to stop model arts task.")
  121. _, err := modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  122. if err != nil {
  123. log.Info("stop failed.err=" + err.Error())
  124. }
  125. task.Status = string(models.JobStopped)
  126. if task.EndTime == 0 {
  127. task.EndTime = timeutil.TimeStampNow()
  128. }
  129. task.ComputeAndSetDuration()
  130. err = models.UpdateJob(task)
  131. if err != nil {
  132. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  133. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  134. ctx.JSON(200, result)
  135. return
  136. }
  137. //queryTaskStatusFromCloudbrainTwo(job)
  138. } else if task.Type == models.TypeCloudBrainOne {
  139. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  140. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  141. result["msg"] = "cloudbrain.Already_stopped"
  142. ctx.JSON(200, result)
  143. return
  144. }
  145. err := cloudbrain.StopJob(task.JobID)
  146. if err != nil {
  147. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  148. result["msg"] = "cloudbrain.Stopped_failed"
  149. ctx.JSON(200, result)
  150. return
  151. }
  152. task.Status = string(models.JobStopped)
  153. if task.EndTime == 0 {
  154. task.EndTime = timeutil.TimeStampNow()
  155. }
  156. task.ComputeAndSetDuration()
  157. err = models.UpdateJob(task)
  158. if err != nil {
  159. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  160. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  161. ctx.JSON(200, result)
  162. return
  163. }
  164. }
  165. } else {
  166. if task.Status == string(models.ModelSafetyTesting) {
  167. //修改为Failed
  168. task.Status = string(models.JobStopped)
  169. if task.EndTime == 0 {
  170. task.EndTime = timeutil.TimeStampNow()
  171. }
  172. task.ComputeAndSetDuration()
  173. err = models.UpdateJob(task)
  174. if err != nil {
  175. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  176. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  177. ctx.JSON(200, result)
  178. return
  179. }
  180. } else {
  181. log.Info("The job is finished. status=" + task.Status)
  182. }
  183. }
  184. result["result_code"] = "0"
  185. result["msg"] = "succeed"
  186. ctx.JSON(200, result)
  187. }
  188. func DelAiSafetyTask(ctx *context.Context) {
  189. var ID = ctx.Params(":jobid")
  190. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  191. if err != nil {
  192. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  193. ctx.ServerError("No such task.", err)
  194. return
  195. }
  196. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  197. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  198. ctx.ServerError("the job("+task.JobName+") has not been stopped", nil)
  199. return
  200. }
  201. if task.Type == models.TypeCloudBrainOne {
  202. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  203. }
  204. err = models.DeleteJob(task)
  205. if err != nil {
  206. ctx.ServerError(err.Error(), err)
  207. return
  208. }
  209. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  210. }
  211. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  212. log.Info("start to query safety task status.")
  213. if isTaskNotFinished(job.Status) {
  214. if job.Type == models.TypeCloudBrainTwo {
  215. queryTaskStatusFromCloudbrainTwo(job)
  216. } else if job.Type == models.TypeCloudBrainOne {
  217. queryTaskStatusFromCloudbrain(job)
  218. }
  219. } else {
  220. if job.Status == string(models.ModelSafetyTesting) {
  221. queryTaskStatusFromModelSafetyTestServer(job)
  222. } else {
  223. log.Info("The job is finished. status=" + job.Status)
  224. }
  225. }
  226. }
  227. func TimerHandleModelSafetyTestTask() {
  228. log.Info("start to TimerHandleModelSafetyTestTask")
  229. tasks, err := models.GetModelSafetyTestTask()
  230. if err == nil {
  231. if tasks != nil && len(tasks) > 0 {
  232. for _, job := range tasks {
  233. syncAiSafetyTaskStatus(job)
  234. }
  235. } else {
  236. log.Info("query running model safety test task 0.")
  237. }
  238. } else {
  239. log.Info("query running model safety test task err." + err.Error())
  240. }
  241. }
  242. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  243. log.Info("The task not finished,name=" + job.DisplayJobName)
  244. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  245. if err != nil {
  246. log.Info("query train job error." + err.Error())
  247. return
  248. }
  249. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  250. job.Duration = result.Duration / 1000
  251. job.TrainJobDuration = result.TrainJobDuration
  252. if job.StartTime == 0 && result.StartTime > 0 {
  253. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  254. }
  255. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  256. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  257. job.EndTime = job.StartTime.Add(job.Duration)
  258. }
  259. job.CorrectCreateUnix()
  260. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  261. log.Info("CloudbrainTwo task status=" + job.Status)
  262. err = models.UpdateJob(job)
  263. if err != nil {
  264. log.Error("UpdateJob failed:", err)
  265. }
  266. } else {
  267. log.Info("start to deal ModelSafetyTesting, task status=" + job.Status)
  268. job.Status = string(models.ModelSafetyTesting)
  269. err = models.UpdateJob(job)
  270. if err != nil {
  271. log.Error("UpdateJob failed:", err)
  272. }
  273. //send msg to beihang
  274. sendNPUInferenceResultToTest(job)
  275. }
  276. }
  277. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  278. log.Info("The task not finished,name=" + job.DisplayJobName)
  279. jobResult, err := cloudbrain.GetJob(job.JobID)
  280. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  281. if err != nil {
  282. log.Error("ConvertToJobResultPayload failed:", err)
  283. return
  284. }
  285. job.Status = result.JobStatus.State
  286. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  287. taskRoles := result.TaskRoles
  288. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  289. job.Status = taskRes.TaskStatuses[0].State
  290. }
  291. models.ParseAndSetDurationFromCloudBrainOne(result, job)
  292. //updateCloudBrainOneJobTime(job)
  293. log.Info("cloud brain one job status=" + job.Status)
  294. if result.JobStatus.State != string(models.JobSucceeded) {
  295. err = models.UpdateJob(job)
  296. if err != nil {
  297. log.Error("UpdateJob failed:", err)
  298. }
  299. } else {
  300. //
  301. job.Status = string(models.ModelSafetyTesting)
  302. err = models.UpdateJob(job)
  303. if err != nil {
  304. log.Error("UpdateJob failed:", err)
  305. }
  306. //send msg to beihang
  307. sendGPUInferenceResultToTest(job)
  308. }
  309. }
  310. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  311. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  312. if err == nil {
  313. if result.Code == "0" {
  314. if result.Data.Status == 1 {
  315. log.Info("The task is running....")
  316. } else {
  317. if result.Data.Code == 0 {
  318. job.ResultJson = result.Data.StandardJson
  319. job.Status = string(models.JobSucceeded)
  320. err = models.UpdateJob(job)
  321. if err != nil {
  322. log.Error("UpdateJob failed:", err)
  323. }
  324. } else {
  325. job.ResultJson = result.Data.Msg
  326. job.Status = string(models.JobFailed)
  327. err = models.UpdateJob(job)
  328. if err != nil {
  329. log.Error("UpdateJob failed:", err)
  330. }
  331. }
  332. }
  333. } else {
  334. log.Info("The task is failed.")
  335. job.Status = string(models.JobFailed)
  336. err = models.UpdateJob(job)
  337. if err != nil {
  338. log.Error("UpdateJob failed:", err)
  339. }
  340. }
  341. } else {
  342. log.Info("The task not found.....")
  343. }
  344. }
  345. func getAisafetyTaskReq(job *models.Cloudbrain) aisafety.TaskReq {
  346. datasetname := job.DatasetName
  347. datasetnames := strings.Split(datasetname, ";")
  348. indicator := job.LabelName
  349. EvalContent := "test1"
  350. if job.Description != "" {
  351. EvalContent = job.Description
  352. }
  353. req := aisafety.TaskReq{
  354. UnionId: job.JobID,
  355. EvalName: job.DisplayJobName,
  356. EvalContent: EvalContent,
  357. TLPath: "test1",
  358. Indicators: strings.Split(indicator, ";"),
  359. CDName: strings.Split(datasetnames[1], ".")[0],
  360. BDName: strings.Split(datasetnames[0], ".")[0] + "基础数据集",
  361. }
  362. log.Info("CDName=" + req.CDName)
  363. log.Info("BDName=" + req.BDName)
  364. return req
  365. }
  366. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  367. log.Info("send sendGPUInferenceResultToTest")
  368. req := getAisafetyTaskReq(job)
  369. resultDir := "/result"
  370. prefix := setting.CBCodePathPrefix + job.JobName + resultDir
  371. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  372. if err != nil {
  373. log.Error("query cloudbrain one model failed: %v", err)
  374. return
  375. }
  376. jsonContent := ""
  377. for _, file := range files {
  378. if strings.HasSuffix(file.FileName, "result.json") {
  379. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  380. log.Info("path=" + path)
  381. reader, err := os.Open(path)
  382. defer reader.Close()
  383. if err == nil {
  384. r := bufio.NewReader(reader)
  385. for {
  386. line, error := r.ReadString('\n')
  387. jsonContent += line
  388. if error == io.EOF {
  389. log.Info("read file completed.")
  390. break
  391. }
  392. if error != nil {
  393. log.Info("read file error." + error.Error())
  394. break
  395. }
  396. }
  397. }
  398. break
  399. }
  400. }
  401. if jsonContent != "" {
  402. sendHttpReqToBeihang(job, jsonContent, req)
  403. } else {
  404. updateJobFailed(job, "推理生成的Json数据为空,无法进行评测。")
  405. }
  406. }
  407. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  408. log.Info("start to sendNPUInferenceResultToTest")
  409. req := getAisafetyTaskReq(job)
  410. jsonContent := ""
  411. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  412. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  413. resultPath = resultPath[1:]
  414. log.Info("bucket=" + setting.Bucket + " resultPath=" + resultPath)
  415. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  416. if err != nil {
  417. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  418. } else {
  419. defer body.Close()
  420. var data []byte
  421. p := make([]byte, 4096)
  422. var readErr error
  423. var readCount int
  424. for {
  425. readCount, readErr = body.Read(p)
  426. if readCount > 0 {
  427. data = append(data, p[:readCount]...)
  428. }
  429. if readErr != nil || readCount == 0 {
  430. break
  431. }
  432. }
  433. jsonContent = string(data)
  434. }
  435. if jsonContent != "" {
  436. sendHttpReqToBeihang(job, jsonContent, req)
  437. } else {
  438. updateJobFailed(job, "推理生成的Json数据为空,无法进行评测。")
  439. }
  440. }
  441. func updateJobFailed(job *models.Cloudbrain, msg string) {
  442. log.Info("The json is null. so set it failed.")
  443. //update task failed.
  444. job.Status = string(models.ModelArtsTrainJobFailed)
  445. job.ResultJson = msg
  446. err := models.UpdateJob(job)
  447. if err != nil {
  448. log.Error("UpdateJob failed:", err)
  449. }
  450. }
  451. func sendHttpReqToBeihang(job *models.Cloudbrain, jsonContent string, req aisafety.TaskReq) {
  452. log.Info("start to send beihang ...")
  453. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  454. if err == nil {
  455. //update serial no to db
  456. job.PreVersionName = serialNo
  457. err = models.UpdateJob(job)
  458. if err != nil {
  459. log.Error("UpdateJob failed:", err)
  460. }
  461. }
  462. }
  463. func isTaskNotFinished(status string) bool {
  464. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  465. return true
  466. }
  467. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  468. return true
  469. }
  470. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  471. return true
  472. }
  473. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  474. return true
  475. }
  476. return false
  477. }
  478. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  479. t := time.Now()
  480. ctx.Data["PageIsCloudBrain"] = true
  481. ctx.Data["IsCreate"] = true
  482. ctx.Data["type"] = models.TypeCloudBrainOne
  483. ctx.Data["compute_resource"] = models.GPUResource
  484. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  485. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName
  486. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID
  487. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName
  488. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID
  489. log.Info("GPUBaseDataSetName=" + setting.ModelSafetyTest.GPUBaseDataSetName)
  490. log.Info("GPUBaseDataSetUUID=" + setting.ModelSafetyTest.GPUBaseDataSetUUID)
  491. log.Info("GPUCombatDataSetName=" + setting.ModelSafetyTest.GPUCombatDataSetName)
  492. log.Info("GPUCombatDataSetUUID=" + setting.ModelSafetyTest.GPUCombatDataSetUUID)
  493. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  494. ctx.Data["display_job_name"] = displayJobName
  495. prepareCloudbrainOneSpecs(ctx)
  496. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  497. if queuesDetail != nil {
  498. ctx.Data["QueuesDetail"] = queuesDetail
  499. }
  500. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  501. }
  502. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  503. t := time.Now()
  504. ctx.Data["PageIsCloudBrain"] = true
  505. ctx.Data["IsCreate"] = true
  506. ctx.Data["type"] = models.TypeCloudBrainTwo
  507. ctx.Data["compute_resource"] = models.NPUResource
  508. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  509. ctx.Data["display_job_name"] = displayJobName
  510. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  511. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName
  512. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID
  513. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName
  514. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID
  515. log.Info("NPUBaseDataSetName=" + setting.ModelSafetyTest.NPUBaseDataSetName)
  516. log.Info("NPUBaseDataSetUUID=" + setting.ModelSafetyTest.NPUBaseDataSetUUID)
  517. log.Info("NPUCombatDataSetName=" + setting.ModelSafetyTest.NPUCombatDataSetName)
  518. log.Info("NPUCombatDataSetUUID=" + setting.ModelSafetyTest.NPUCombatDataSetUUID)
  519. var resourcePools modelarts.ResourcePool
  520. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  521. ctx.ServerError("json.Unmarshal failed:", err)
  522. }
  523. ctx.Data["resource_pools"] = resourcePools.Info
  524. var engines modelarts.Engine
  525. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  526. ctx.ServerError("json.Unmarshal failed:", err)
  527. }
  528. ctx.Data["engines"] = engines.Info
  529. var versionInfos modelarts.VersionInfo
  530. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  531. ctx.ServerError("json.Unmarshal failed:", err)
  532. }
  533. ctx.Data["engine_versions"] = versionInfos.Version
  534. prepareCloudbrainTwoInferenceSpecs(ctx)
  535. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  536. ctx.Data["WaitCount"] = waitCount
  537. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  538. }
  539. func AiSafetyCreateForPost(ctx *context.Context) {
  540. ctx.Data["PageIsCloudBrain"] = true
  541. displayJobName := ctx.Query("display_job_name")
  542. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  543. taskType := ctx.QueryInt("type")
  544. description := ctx.Query("description")
  545. ctx.Data["type"] = taskType
  546. ctx.Data["displayJobName"] = displayJobName
  547. ctx.Data["description"] = description
  548. repo := ctx.Repo.Repository
  549. tpname := tplCloudBrainModelSafetyNewNpu
  550. if taskType == models.TypeCloudBrainOne {
  551. tpname = tplCloudBrainModelSafetyNewGpu
  552. }
  553. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  554. if err == nil {
  555. if len(tasks) != 0 {
  556. log.Error("the job name did already exist", ctx.Data["MsgID"])
  557. modelSafetyNewDataPrepare(ctx)
  558. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  559. return
  560. }
  561. } else {
  562. if !models.IsErrJobNotExist(err) {
  563. log.Error("system error, %v", err, ctx.Data["MsgID"])
  564. modelSafetyNewDataPrepare(ctx)
  565. ctx.RenderWithErr("system error", tpname, nil)
  566. return
  567. }
  568. }
  569. if !jobNamePattern.MatchString(jobName) {
  570. modelSafetyNewDataPrepare(ctx)
  571. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  572. return
  573. }
  574. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  575. if err != nil {
  576. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  577. modelSafetyNewDataPrepare(ctx)
  578. ctx.RenderWithErr("system error", tpname, nil)
  579. return
  580. } else {
  581. if count >= 1 {
  582. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  583. modelSafetyNewDataPrepare(ctx)
  584. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  585. return
  586. }
  587. }
  588. BootFile := ctx.Query("boot_file")
  589. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  590. if err != nil || !bootFileExist {
  591. log.Error("Get bootfile error:", err)
  592. modelSafetyNewDataPrepare(ctx)
  593. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  594. return
  595. }
  596. if taskType == models.TypeCloudBrainTwo {
  597. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  598. err = createForNPU(ctx, jobName)
  599. } else if taskType == models.TypeCloudBrainOne {
  600. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  601. err = createForGPU(ctx, jobName)
  602. }
  603. if err != nil {
  604. modelSafetyNewDataPrepare(ctx)
  605. ctx.RenderWithErr(err.Error(), tpname, nil)
  606. } else {
  607. log.Info("to redirect...")
  608. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  609. }
  610. }
  611. func createForNPU(ctx *context.Context, jobName string) error {
  612. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  613. BootFile := ctx.Query("boot_file")
  614. displayJobName := ctx.Query("display_job_name")
  615. description := ctx.Query("description")
  616. srcDataset := ctx.Query("src_dataset") //uuid
  617. combatDataset := ctx.Query("combat_dataset") //uuid
  618. evaluationIndex := ctx.Query("evaluation_index")
  619. Params := ctx.Query("run_para_list")
  620. specId := ctx.QueryInt64("spec_id")
  621. engineID := ctx.QueryInt("engine_id")
  622. log.Info("engine_id=" + fmt.Sprint(engineID))
  623. poolID := ctx.Query("pool_id")
  624. repo := ctx.Repo.Repository
  625. trainUrl := ctx.Query("train_url")
  626. modelName := ctx.Query("model_name")
  627. modelVersion := ctx.Query("model_version")
  628. ckptName := ctx.Query("ckpt_name")
  629. ckptUrl := "/" + trainUrl + ckptName
  630. log.Info("ckpt url:" + ckptUrl)
  631. FlavorName := ctx.Query("flaver_names")
  632. EngineName := ctx.Query("engine_names")
  633. isLatestVersion := modelarts.IsLatestVersion
  634. VersionCount := modelarts.VersionCountOne
  635. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  636. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  637. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  638. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  639. log.Info("ckpt url:" + ckptUrl)
  640. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  641. JobType: models.JobTypeInference,
  642. ComputeResource: models.NPU,
  643. Cluster: models.OpenICluster,
  644. AiCenterCode: models.AICenterOfCloudBrainTwo})
  645. if err != nil || spec == nil {
  646. //ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  647. return errors.New("Resource specification not available")
  648. }
  649. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  650. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  651. return errors.New(ctx.Tr("points.insufficient_points_balance"))
  652. }
  653. //todo: del the codeLocalPath
  654. _, err = ioutil.ReadDir(codeLocalPath)
  655. if err == nil {
  656. os.RemoveAll(codeLocalPath)
  657. }
  658. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  659. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  660. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  661. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  662. return errors.New(ctx.Tr("cloudbrain.load_code_failed"))
  663. }
  664. //todo: upload code (send to file_server todo this work?)
  665. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  666. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  667. return errors.New("Failed to obsMkdir_result")
  668. }
  669. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  670. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  671. return errors.New("Failed to obsMkdir_log")
  672. }
  673. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  674. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  675. return errors.New(ctx.Tr("cloudbrain.load_code_failed"))
  676. }
  677. var parameters models.Parameters
  678. param := make([]models.Parameter, 0)
  679. param = append(param, models.Parameter{
  680. Label: modelarts.ResultUrl,
  681. Value: "s3:/" + resultObsPath,
  682. }, models.Parameter{
  683. Label: modelarts.CkptUrl,
  684. Value: "s3:/" + ckptUrl,
  685. })
  686. uuid := srcDataset + ";" + combatDataset
  687. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  688. if err != nil {
  689. return err
  690. }
  691. dataPath := dataUrl
  692. jsondatas, err := json.Marshal(datasUrlList)
  693. if err != nil {
  694. log.Error("Failed to Marshal: %v", err)
  695. return err
  696. }
  697. if isMultiDataset {
  698. param = append(param, models.Parameter{
  699. Label: modelarts.MultiDataUrl,
  700. Value: string(jsondatas),
  701. })
  702. }
  703. existDeviceTarget := false
  704. if len(Params) != 0 {
  705. err := json.Unmarshal([]byte(Params), &parameters)
  706. if err != nil {
  707. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  708. return errors.New("运行参数错误")
  709. }
  710. for _, parameter := range parameters.Parameter {
  711. if parameter.Label == modelarts.DeviceTarget {
  712. existDeviceTarget = true
  713. }
  714. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  715. param = append(param, models.Parameter{
  716. Label: parameter.Label,
  717. Value: parameter.Value,
  718. })
  719. }
  720. }
  721. }
  722. if !existDeviceTarget {
  723. param = append(param, models.Parameter{
  724. Label: modelarts.DeviceTarget,
  725. Value: modelarts.Ascend,
  726. })
  727. }
  728. req := &modelarts.GenerateInferenceJobReq{
  729. JobName: jobName,
  730. DisplayJobName: displayJobName,
  731. DataUrl: dataPath,
  732. Description: description,
  733. CodeObsPath: codeObsPath,
  734. BootFileUrl: codeObsPath + BootFile,
  735. BootFile: BootFile,
  736. TrainUrl: trainUrl,
  737. WorkServerNumber: 1,
  738. EngineID: int64(engineID),
  739. LogUrl: logObsPath,
  740. PoolID: poolID,
  741. Uuid: uuid,
  742. Parameters: param, //modelarts train parameters
  743. CommitID: commitID,
  744. BranchName: cloudbrain.DefaultBranchName,
  745. Params: Params,
  746. FlavorName: FlavorName,
  747. EngineName: EngineName,
  748. LabelName: evaluationIndex,
  749. IsLatestVersion: isLatestVersion,
  750. VersionCount: VersionCount,
  751. TotalVersionCount: modelarts.TotalVersionCount,
  752. ModelName: modelName,
  753. ModelVersion: modelVersion,
  754. CkptName: ckptName,
  755. ResultUrl: resultObsPath,
  756. Spec: spec,
  757. DatasetName: datasetNames,
  758. JobType: string(models.JobTypeModelSafety),
  759. }
  760. err = modelarts.GenerateInferenceJob(ctx, req)
  761. if err != nil {
  762. log.Error("GenerateTrainJob failed:%v", err.Error())
  763. return err
  764. }
  765. return nil
  766. }
  767. func createForGPU(ctx *context.Context, jobName string) error {
  768. BootFile := ctx.Query("boot_file")
  769. displayJobName := ctx.Query("display_job_name")
  770. description := ctx.Query("description")
  771. image := strings.TrimSpace(ctx.Query("image"))
  772. srcDataset := ctx.Query("src_dataset") //uuid
  773. combatDataset := ctx.Query("combat_dataset") //uuid
  774. evaluationIndex := ctx.Query("evaluation_index")
  775. Params := ctx.Query("run_para_list")
  776. specId := ctx.QueryInt64("spec_id")
  777. TrainUrl := ctx.Query("train_url")
  778. CkptName := ctx.Query("ckpt_name")
  779. modelName := ctx.Query("model_name")
  780. modelVersion := ctx.Query("model_version")
  781. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  782. log.Info("ckpt url:" + ckptUrl)
  783. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  784. JobType: models.JobTypeBenchmark,
  785. ComputeResource: models.GPU,
  786. Cluster: models.OpenICluster,
  787. AiCenterCode: models.AICenterOfCloudBrainOne})
  788. if err != nil || spec == nil {
  789. return errors.New("Resource specification not available")
  790. }
  791. repo := ctx.Repo.Repository
  792. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  793. os.RemoveAll(codePath)
  794. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  795. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  796. return errors.New("system error")
  797. }
  798. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  799. if err != nil {
  800. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  801. return errors.New("system error")
  802. }
  803. uuid := srcDataset + ";" + combatDataset
  804. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  805. log.Info("uuid=" + uuid)
  806. if err != nil {
  807. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  808. return errors.New(ctx.Tr("cloudbrain.error.dataset_select"))
  809. }
  810. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  811. if err != nil {
  812. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  813. return errors.New(ctx.Tr("cloudbrain.error.dataset_select"))
  814. }
  815. log.Info("Command=" + command)
  816. req := cloudbrain.GenerateCloudBrainTaskReq{
  817. Ctx: ctx,
  818. DisplayJobName: displayJobName,
  819. JobName: jobName,
  820. Image: image,
  821. Command: command,
  822. Uuids: uuid,
  823. DatasetNames: datasetNames,
  824. DatasetInfos: datasetInfos,
  825. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  826. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  827. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  828. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  829. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  830. JobType: string(models.JobTypeModelSafety),
  831. Description: description,
  832. BranchName: cloudbrain.DefaultBranchName,
  833. BootFile: BootFile,
  834. Params: Params,
  835. CommitID: "",
  836. ModelName: modelName,
  837. ModelVersion: modelVersion,
  838. CkptName: CkptName,
  839. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  840. Spec: spec,
  841. LabelName: evaluationIndex,
  842. }
  843. err = cloudbrain.GenerateTask(req)
  844. if err != nil {
  845. return err
  846. }
  847. return nil
  848. }
  849. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  850. var command string
  851. bootFile := strings.TrimSpace(BootFile)
  852. if !strings.HasSuffix(bootFile, ".py") {
  853. log.Error("bootFile(%s) format error", bootFile)
  854. return command, errors.New("bootFile format error")
  855. }
  856. var parameters models.Parameters
  857. var param string
  858. if len(params) != 0 {
  859. err := json.Unmarshal([]byte(params), &parameters)
  860. if err != nil {
  861. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  862. return command, err
  863. }
  864. for _, parameter := range parameters.Parameter {
  865. param += " --" + parameter.Label + "=" + parameter.Value
  866. }
  867. }
  868. param += " --ckpt_url=" + cloudbrain.ModelMountPath + "/" + CkptName
  869. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  870. return command, nil
  871. }
  872. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  873. ctx.Data["PageIsCloudBrain"] = true
  874. ctx.Data["type"] = ctx.QueryInt("type")
  875. ctx.Data["boot_file"] = ctx.Query("boot_file")
  876. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  877. ctx.Data["description"] = ctx.Query("description")
  878. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  879. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  880. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  881. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  882. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  883. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  884. ctx.Data["train_url"] = ctx.Query("train_url")
  885. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  886. ctx.Data["train_url"] = ctx.Query("train_url")
  887. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  888. ctx.Data["model_name"] = ctx.Query("model_name")
  889. ctx.Data["model_version"] = ctx.Query("model_version")
  890. if ctx.QueryInt("type") == models.TypeCloudBrainOne {
  891. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName
  892. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID
  893. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName
  894. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID
  895. } else {
  896. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName
  897. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID
  898. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName
  899. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID
  900. }
  901. prepareCloudbrainOneSpecs(ctx)
  902. prepareCloudbrainTwoInferenceSpecs(ctx)
  903. return nil
  904. }
  905. func getJsonContent(url string) (string, error) {
  906. resp, err := http.Get(url)
  907. if err != nil || resp.StatusCode != 200 {
  908. log.Info("Get organizations url error=" + err.Error())
  909. return "", err
  910. }
  911. bytes, err := ioutil.ReadAll(resp.Body)
  912. resp.Body.Close()
  913. if err != nil {
  914. log.Info("Get organizations url error=" + err.Error())
  915. return "", err
  916. }
  917. str := string(bytes)
  918. //log.Info("json str =" + str)
  919. return str, nil
  920. }