You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aisafety.go 42 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/aisafety"
  16. "code.gitea.io/gitea/modules/cloudbrain"
  17. "code.gitea.io/gitea/modules/context"
  18. "code.gitea.io/gitea/modules/git"
  19. "code.gitea.io/gitea/modules/grampus"
  20. "code.gitea.io/gitea/modules/log"
  21. "code.gitea.io/gitea/modules/modelarts"
  22. "code.gitea.io/gitea/modules/setting"
  23. "code.gitea.io/gitea/modules/storage"
  24. "code.gitea.io/gitea/modules/timeutil"
  25. "code.gitea.io/gitea/modules/util"
  26. "code.gitea.io/gitea/services/cloudbrain/resource"
  27. "code.gitea.io/gitea/services/reward/point/account"
  28. uuid "github.com/satori/go.uuid"
  29. )
  // Template paths used by the model-safety-test handlers in this file.
  const (
  	// Detail page of a single safety-test task.
  	tplModelSafetyTestShow = "repo/modelsafety/show"

  	// Creation forms, one per (cluster, compute resource) combination.
  	tplModelSafetyTestCreateGpu        = "repo/modelsafety/newgpu"
  	tplModelSafetyTestCreateNpu        = "repo/modelsafety/newnpu"
  	tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
  	tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
  )
  37. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  38. log.Info("start to create CloudBrainAiSafetyCreate")
  39. uuid := uuid.NewV4()
  40. id := uuid.String()
  41. seriaNoParas := ctx.Query("serialNo")
  42. fileName := ctx.Query("fileName")
  43. //if jobType == string(models.JobTypeBenchmark) {
  44. req := aisafety.TaskReq{
  45. UnionId: id,
  46. EvalName: "test1",
  47. EvalContent: "test1",
  48. TLPath: "test1",
  49. Indicators: []string{"ACC", "ASS"},
  50. CDName: "CIFAR10_1000_FGSM",
  51. BDName: "CIFAR10_1000基础数据集",
  52. }
  53. aisafety.GetAlgorithmList()
  54. if seriaNoParas != "" {
  55. aisafety.GetTaskStatus(seriaNoParas)
  56. } else {
  57. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  58. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  59. if err == nil {
  60. log.Info("serialNo=" + serialNo)
  61. time.Sleep(time.Duration(2) * time.Second)
  62. aisafety.GetTaskStatus(serialNo)
  63. } else {
  64. log.Info("CreateSafetyTask error," + err.Error())
  65. }
  66. }
  67. }
  68. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  69. if job == nil {
  70. log.Error("GetCloudbrainByJobID failed")
  71. return
  72. }
  73. syncAiSafetyTaskStatus(job)
  74. }
  75. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  76. ctx.Data["id"] = ctx.Params(":jobid")
  77. ctx.HTML(200, tplModelSafetyTestShow)
  78. }
  79. func GetAiSafetyTask(ctx *context.Context) {
  80. var ID = ctx.Params(":jobid")
  81. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  82. if err != nil {
  83. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  84. return
  85. }
  86. syncAiSafetyTaskStatus(job)
  87. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  88. job.BenchmarkType = "安全评测"
  89. job.BenchmarkTypeName = "Image Classification"
  90. ctx.JSON(200, job)
  91. }
  92. func StopAiSafetyTask(ctx *context.Context) {
  93. log.Info("start to stop the task.")
  94. var ID = ctx.Params(":jobid")
  95. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  96. result := make(map[string]interface{})
  97. result["code"] = -1
  98. if err != nil {
  99. log.Info("query task error.err=" + err.Error())
  100. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  101. result["msg"] = "No such task."
  102. ctx.JSON(200, result)
  103. return
  104. }
  105. if isTaskNotFinished(task.Status) {
  106. if task.Type == models.TypeCloudBrainTwo {
  107. log.Info("start to stop model arts task.")
  108. _, err := modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  109. if err != nil {
  110. log.Info("stop failed.err=" + err.Error())
  111. }
  112. task.Status = string(models.JobStopped)
  113. if task.EndTime == 0 {
  114. task.EndTime = timeutil.TimeStampNow()
  115. }
  116. task.ComputeAndSetDuration()
  117. err = models.UpdateJob(task)
  118. if err != nil {
  119. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  120. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  121. ctx.JSON(200, result)
  122. return
  123. }
  124. //queryTaskStatusFromCloudbrainTwo(job)
  125. } else if task.Type == models.TypeCloudBrainOne {
  126. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  127. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  128. result["msg"] = "cloudbrain.Already_stopped"
  129. ctx.JSON(200, result)
  130. return
  131. }
  132. err := cloudbrain.StopJob(task.JobID)
  133. if err != nil {
  134. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  135. result["msg"] = "cloudbrain.Stopped_failed"
  136. ctx.JSON(200, result)
  137. return
  138. }
  139. task.Status = string(models.JobStopped)
  140. if task.EndTime == 0 {
  141. task.EndTime = timeutil.TimeStampNow()
  142. }
  143. task.ComputeAndSetDuration()
  144. err = models.UpdateJob(task)
  145. if err != nil {
  146. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  147. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  148. ctx.JSON(200, result)
  149. return
  150. }
  151. }
  152. } else {
  153. if task.Status == string(models.ModelSafetyTesting) {
  154. //修改为Failed
  155. task.Status = string(models.JobStopped)
  156. if task.EndTime == 0 {
  157. task.EndTime = timeutil.TimeStampNow()
  158. }
  159. task.ComputeAndSetDuration()
  160. err = models.UpdateJob(task)
  161. if err != nil {
  162. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  163. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  164. ctx.JSON(200, result)
  165. return
  166. }
  167. } else {
  168. log.Info("The job is finished. status=" + task.Status)
  169. }
  170. }
  171. }
  172. func DelAiSafetyTask(ctx *context.Context) {
  173. var ID = ctx.Params(":jobid")
  174. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  175. result := make(map[string]interface{})
  176. result["code"] = 1
  177. if err != nil {
  178. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  179. result["msg"] = "No such task."
  180. ctx.JSON(200, result)
  181. return
  182. }
  183. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  184. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  185. result["msg"] = "the job(" + task.JobName + ") has not been stopped"
  186. ctx.JSON(200, result)
  187. return
  188. }
  189. if task.Type == models.TypeCloudBrainOne {
  190. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  191. }
  192. err = models.DeleteJob(task)
  193. if err != nil {
  194. result["msg"] = err.Error()
  195. ctx.JSON(200, result)
  196. return
  197. }
  198. result["code"] = 0
  199. result["msg"] = "Succeed"
  200. ctx.JSON(200, result)
  201. }
  202. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  203. log.Info("start to query safety task status.")
  204. if isTaskNotFinished(job.Status) {
  205. if job.Type == models.TypeCloudBrainTwo {
  206. queryTaskStatusFromCloudbrainTwo(job)
  207. } else if job.Type == models.TypeCloudBrainOne {
  208. queryTaskStatusFromCloudbrain(job)
  209. } else if job.Type == models.TypeC2Net {
  210. queryTaskStatusFromGrampus(job)
  211. }
  212. } else {
  213. if job.Status == string(models.ModelSafetyTesting) {
  214. queryTaskStatusFromModelSafetyTestServer(job)
  215. } else {
  216. log.Info("The job is finished. status=" + job.Status)
  217. }
  218. }
  219. }
  220. func TimerHandleModelSafetyTestTask() {
  221. log.Info("start to TimerHandleModelSafetyTestTask")
  222. tasks, err := models.GetModelSafetyTestTask()
  223. if err == nil {
  224. if tasks != nil && len(tasks) > 0 {
  225. for _, job := range tasks {
  226. syncAiSafetyTaskStatus(job)
  227. }
  228. } else {
  229. log.Info("query running model safety test task 0.")
  230. }
  231. } else {
  232. log.Info("query running model safety test task err." + err.Error())
  233. }
  234. }
  235. func queryTaskStatusFromGrampus(task *models.Cloudbrain) {
  236. log.Info("The task not finished,name=" + task.DisplayJobName)
  237. if task.DeletedAt.IsZero() { //normal record
  238. result, err := grampus.GetJob(task.JobID)
  239. resultJson, _ := json.Marshal(result)
  240. log.Info("resultJson=" + string(resultJson))
  241. if err != nil {
  242. log.Error("GetJob failed:" + err.Error())
  243. return
  244. }
  245. if result != nil {
  246. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  247. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  248. }
  249. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  250. if task.Status != models.GrampusStatusSucceeded {
  251. if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning {
  252. task.Duration = result.JobInfo.RunSec
  253. if task.Duration < 0 {
  254. task.Duration = 0
  255. }
  256. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  257. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  258. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  259. }
  260. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  261. task.EndTime = task.StartTime.Add(task.Duration)
  262. }
  263. task.CorrectCreateUnix()
  264. err = models.UpdateJob(task)
  265. if err != nil {
  266. log.Error("UpdateJob failed:" + err.Error())
  267. }
  268. }
  269. } else {
  270. task.Status = string(models.ModelSafetyTesting)
  271. err = models.UpdateJob(task)
  272. if err != nil {
  273. log.Error("UpdateJob failed:", err)
  274. }
  275. //send msg to beihang
  276. sendGPUInferenceResultToTest(task)
  277. }
  278. }
  279. }
  280. }
  281. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  282. log.Info("The task not finished,name=" + job.DisplayJobName)
  283. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  284. if err != nil {
  285. log.Info("query train job error." + err.Error())
  286. return
  287. }
  288. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  289. job.Duration = result.Duration
  290. job.TrainJobDuration = result.TrainJobDuration
  291. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  292. err = models.UpdateJob(job)
  293. if err != nil {
  294. log.Error("UpdateJob failed:", err)
  295. }
  296. } else {
  297. job.Status = string(models.ModelSafetyTesting)
  298. err = models.UpdateJob(job)
  299. if err != nil {
  300. log.Error("UpdateJob failed:", err)
  301. }
  302. //send msg to beihang
  303. sendNPUInferenceResultToTest(job)
  304. }
  305. }
  306. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  307. datasetname := job.DatasetName
  308. datasetnames := strings.Split(datasetname, ";")
  309. indicator := job.LabelName
  310. req := aisafety.TaskReq{
  311. UnionId: job.JobID,
  312. EvalName: job.DisplayJobName,
  313. EvalContent: job.Description,
  314. TLPath: "test",
  315. Indicators: strings.Split(indicator, ";"),
  316. CDName: datasetnames[1],
  317. BDName: datasetnames[0],
  318. }
  319. jsonContent := ""
  320. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  321. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  322. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  323. if err != nil {
  324. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  325. } else {
  326. defer body.Close()
  327. var data []byte
  328. p := make([]byte, 4096)
  329. var readErr error
  330. var readCount int
  331. for {
  332. readCount, readErr = body.Read(p)
  333. if readCount > 0 {
  334. data = append(data, p[:readCount]...)
  335. }
  336. if readErr != nil || readCount == 0 {
  337. break
  338. }
  339. }
  340. jsonContent = string(data)
  341. }
  342. if jsonContent != "" {
  343. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  344. if err == nil {
  345. //update serial no to db
  346. job.PreVersionName = serialNo
  347. err = models.UpdateJob(job)
  348. if err != nil {
  349. log.Error("UpdateJob failed:", err)
  350. }
  351. }
  352. } else {
  353. log.Info("The json is null. so set it failed.")
  354. //update task failed.
  355. job.Status = string(models.ModelArtsTrainJobFailed)
  356. err := models.UpdateJob(job)
  357. if err != nil {
  358. log.Error("UpdateJob failed:", err)
  359. }
  360. }
  361. }
  362. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  363. log.Info("The task not finished,name=" + job.DisplayJobName)
  364. jobResult, err := cloudbrain.GetJob(job.JobID)
  365. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  366. if err != nil {
  367. log.Error("ConvertToJobResultPayload failed:", err)
  368. return
  369. }
  370. job.Status = result.JobStatus.State
  371. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  372. taskRoles := result.TaskRoles
  373. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  374. job.Status = taskRes.TaskStatuses[0].State
  375. }
  376. if result.JobStatus.State != string(models.JobSucceeded) {
  377. err = models.UpdateJob(job)
  378. if err != nil {
  379. log.Error("UpdateJob failed:", err)
  380. }
  381. } else {
  382. //
  383. job.Status = string(models.ModelSafetyTesting)
  384. err = models.UpdateJob(job)
  385. if err != nil {
  386. log.Error("UpdateJob failed:", err)
  387. }
  388. //send msg to beihang
  389. sendGPUInferenceResultToTest(job)
  390. }
  391. }
  392. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  393. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  394. if err == nil {
  395. if result.Code == "0" {
  396. if result.Data.Status == 1 {
  397. log.Info("The task is running....")
  398. } else {
  399. if result.Data.Code == 0 {
  400. job.ResultJson = result.Data.StandardJson
  401. err = models.UpdateJob(job)
  402. if err != nil {
  403. log.Error("UpdateJob failed:", err)
  404. }
  405. }
  406. }
  407. } else {
  408. log.Info("The task is failed.")
  409. job.Status = string(models.JobFailed)
  410. err = models.UpdateJob(job)
  411. if err != nil {
  412. log.Error("UpdateJob failed:", err)
  413. }
  414. }
  415. } else {
  416. log.Info("The task not found.....")
  417. }
  418. }
  419. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  420. datasetname := job.DatasetName
  421. datasetnames := strings.Split(datasetname, ";")
  422. indicator := job.LabelName
  423. req := aisafety.TaskReq{
  424. UnionId: job.JobID,
  425. EvalName: job.DisplayJobName,
  426. EvalContent: job.Description,
  427. TLPath: "test",
  428. Indicators: strings.Split(indicator, ";"),
  429. CDName: datasetnames[1],
  430. BDName: datasetnames[0],
  431. }
  432. resultDir := "/model"
  433. prefix := "/" + setting.CBCodePathPrefix + job.JobName + resultDir
  434. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  435. if err != nil {
  436. log.Error("query cloudbrain one model failed: %v", err)
  437. return
  438. }
  439. jsonContent := ""
  440. for _, file := range files {
  441. if strings.HasSuffix(file.FileName, "result.json") {
  442. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  443. log.Info("path=" + path)
  444. reader, err := os.Open(path)
  445. defer reader.Close()
  446. if err == nil {
  447. r := bufio.NewReader(reader)
  448. for {
  449. line, error := r.ReadString('\n')
  450. if error == io.EOF {
  451. log.Info("read file completed.")
  452. break
  453. }
  454. if error != nil {
  455. log.Info("read file error." + error.Error())
  456. break
  457. }
  458. jsonContent += line
  459. }
  460. }
  461. break
  462. }
  463. }
  464. if jsonContent != "" {
  465. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  466. if err == nil {
  467. //update serial no to db
  468. job.PreVersionName = serialNo
  469. err = models.UpdateJob(job)
  470. if err != nil {
  471. log.Error("UpdateJob failed:", err)
  472. }
  473. }
  474. } else {
  475. log.Info("The json is null. so set it failed.")
  476. //update task failed.
  477. job.Status = string(models.JobFailed)
  478. err = models.UpdateJob(job)
  479. if err != nil {
  480. log.Error("UpdateJob failed:", err)
  481. }
  482. }
  483. }
  484. func isTaskNotFinished(status string) bool {
  485. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  486. return true
  487. }
  488. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  489. return true
  490. }
  491. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  492. return true
  493. }
  494. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  495. return true
  496. }
  497. return false
  498. }
  499. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  500. t := time.Now()
  501. ctx.Data["PageIsCloudBrain"] = true
  502. ctx.Data["IsCreate"] = true
  503. ctx.Data["type"] = models.TypeCloudBrainOne
  504. ctx.Data["compute_resource"] = models.GPUResource
  505. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  506. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  507. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  508. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  509. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  510. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  511. ctx.Data["display_job_name"] = displayJobName
  512. prepareCloudbrainOneSpecs(ctx)
  513. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  514. if queuesDetail != nil {
  515. ctx.Data["QueuesDetail"] = queuesDetail
  516. }
  517. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  518. }
  519. func AiSafetyCreateForGetGrampusGPU(ctx *context.Context) {
  520. ctx.Data["PageIsCloudBrain"] = true
  521. ctx.Data["IsCreate"] = true
  522. ctx.Data["type"] = models.TypeC2Net
  523. ctx.Data["compute_resource"] = models.GPUResource
  524. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  525. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  526. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  527. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  528. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  529. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  530. if err != nil {
  531. ctx.ServerError("get new train-job info failed", err)
  532. return
  533. }
  534. ctx.HTML(200, tplModelSafetyTestCreateGrampusGpu)
  535. }
  536. func AiSafetyCreateForGetGrampusNPU(ctx *context.Context) {
  537. ctx.Data["PageIsCloudBrain"] = true
  538. ctx.Data["IsCreate"] = true
  539. ctx.Data["type"] = models.TypeC2Net
  540. ctx.Data["compute_resource"] = models.NPUResource
  541. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  542. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  543. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  544. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  545. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  546. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  547. if err != nil {
  548. ctx.ServerError("get new train-job info failed", err)
  549. return
  550. }
  551. ctx.HTML(200, tplModelSafetyTestCreateGrampusNpu)
  552. }
  553. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  554. t := time.Now()
  555. ctx.Data["PageIsCloudBrain"] = true
  556. ctx.Data["IsCreate"] = true
  557. ctx.Data["type"] = models.TypeCloudBrainTwo
  558. ctx.Data["compute_resource"] = models.NPUResource
  559. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  560. ctx.Data["display_job_name"] = displayJobName
  561. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  562. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  563. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  564. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  565. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  566. var resourcePools modelarts.ResourcePool
  567. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  568. ctx.ServerError("json.Unmarshal failed:", err)
  569. }
  570. ctx.Data["resource_pools"] = resourcePools.Info
  571. var engines modelarts.Engine
  572. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  573. ctx.ServerError("json.Unmarshal failed:", err)
  574. }
  575. ctx.Data["engines"] = engines.Info
  576. var versionInfos modelarts.VersionInfo
  577. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  578. ctx.ServerError("json.Unmarshal failed:", err)
  579. }
  580. ctx.Data["engine_versions"] = versionInfos.Version
  581. prepareCloudbrainTwoInferenceSpecs(ctx)
  582. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  583. ctx.Data["WaitCount"] = waitCount
  584. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  585. }
  586. func AiSafetyCreateForPost(ctx *context.Context) {
  587. ctx.Data["PageIsCloudBrain"] = true
  588. displayJobName := ctx.Query("display_job_name")
  589. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  590. taskType := ctx.QueryInt("type")
  591. description := ctx.Query("description")
  592. ctx.Data["type"] = taskType
  593. ctx.Data["displayJobName"] = displayJobName
  594. ctx.Data["description"] = description
  595. repo := ctx.Repo.Repository
  596. tpname := tplCloudBrainModelSafetyNewNpu
  597. if taskType == models.TypeCloudBrainOne {
  598. tpname = tplCloudBrainModelSafetyNewGpu
  599. }
  600. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  601. if err == nil {
  602. if len(tasks) != 0 {
  603. log.Error("the job name did already exist", ctx.Data["MsgID"])
  604. modelSafetyNewDataPrepare(ctx)
  605. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  606. return
  607. }
  608. } else {
  609. if !models.IsErrJobNotExist(err) {
  610. log.Error("system error, %v", err, ctx.Data["MsgID"])
  611. modelSafetyNewDataPrepare(ctx)
  612. ctx.RenderWithErr("system error", tpname, nil)
  613. return
  614. }
  615. }
  616. if !jobNamePattern.MatchString(jobName) {
  617. modelSafetyNewDataPrepare(ctx)
  618. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  619. return
  620. }
  621. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  622. if err != nil {
  623. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  624. modelSafetyNewDataPrepare(ctx)
  625. ctx.RenderWithErr("system error", tpname, nil)
  626. return
  627. } else {
  628. if count >= 1 {
  629. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  630. modelSafetyNewDataPrepare(ctx)
  631. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  632. return
  633. }
  634. }
  635. BootFile := ctx.Query("boot_file")
  636. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  637. if err != nil || !bootFileExist {
  638. log.Error("Get bootfile error:", err)
  639. modelSafetyNewDataPrepare(ctx)
  640. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  641. return
  642. }
  643. if taskType == models.TypeCloudBrainTwo {
  644. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  645. createForNPU(ctx, jobName)
  646. } else if taskType == models.TypeCloudBrainOne {
  647. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  648. createForGPU(ctx, jobName)
  649. } else if taskType == models.TypeC2Net {
  650. ComputeResource := ctx.Query("compute_resource")
  651. if ComputeResource == models.NPUResource {
  652. createForGrampusNPU(ctx, jobName)
  653. } else if ComputeResource == models.GPUResource {
  654. createForGrampusGPU(ctx, jobName)
  655. }
  656. }
  657. log.Info("to redirect...")
  658. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  659. }
  660. func createForGrampusGPU(ctx *context.Context, jobName string) {
  661. BootFile := ctx.Query("boot_file")
  662. displayJobName := ctx.Query("display_job_name")
  663. description := ctx.Query("description")
  664. image := strings.TrimSpace(ctx.Query("image"))
  665. srcDataset := ctx.Query("src_dataset") //uuid
  666. combatDataset := ctx.Query("combat_dataset") //uuid
  667. evaluationIndex := ctx.Query("evaluation_index")
  668. Params := ctx.Query("run_para_list")
  669. specId := ctx.QueryInt64("spec_id")
  670. TrainUrl := ctx.Query("train_url")
  671. CkptName := ctx.Query("ckpt_name")
  672. ModelName := ctx.Query("model_name")
  673. ModelVersion := ctx.Query("model_version")
  674. repo := ctx.Repo.Repository
  675. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  676. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  677. //check specification
  678. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  679. JobType: models.JobTypeTrain,
  680. ComputeResource: models.GPU,
  681. Cluster: models.C2NetCluster,
  682. })
  683. if err != nil || spec == nil {
  684. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  685. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  686. return
  687. }
  688. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  689. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  690. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  691. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  692. return
  693. }
  694. //check dataset
  695. uuid := srcDataset + ";" + combatDataset
  696. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  697. if err != nil {
  698. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  699. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  700. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  701. return
  702. }
  703. //prepare code and out path
  704. _, err = ioutil.ReadDir(codeLocalPath)
  705. if err == nil {
  706. os.RemoveAll(codeLocalPath)
  707. }
  708. if err := downloadZipCode(ctx, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  709. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  710. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  711. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  712. return
  713. }
  714. //todo: upload code (send to file_server todo this work?)
  715. //upload code
  716. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  717. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  718. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  719. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  720. return
  721. }
  722. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  723. if err := mkModelPath(modelPath); err != nil {
  724. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  725. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  726. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  727. return
  728. }
  729. //init model readme
  730. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  731. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  732. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  733. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  734. return
  735. }
  736. var datasetRemotePath, allFileName string
  737. for _, datasetInfo := range datasetInfos {
  738. if datasetRemotePath == "" {
  739. datasetRemotePath = datasetInfo.DataLocalPath
  740. allFileName = datasetInfo.FullName
  741. } else {
  742. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  743. allFileName = allFileName + ";" + datasetInfo.FullName
  744. }
  745. }
  746. //prepare command
  747. preTrainModelPath := getPreTrainModelPath(TrainUrl, CkptName)
  748. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, BootFile, Params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, CkptName)
  749. if err != nil {
  750. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  751. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  752. ctx.RenderWithErr("Create task failed, internal error", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  753. return
  754. }
  755. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  756. req := &grampus.GenerateTrainJobReq{
  757. JobName: jobName,
  758. DisplayJobName: displayJobName,
  759. ComputeResource: models.GPUResource,
  760. ProcessType: grampus.ProcessorTypeGPU,
  761. Command: command,
  762. ImageUrl: image,
  763. Description: description,
  764. BootFile: BootFile,
  765. Uuid: uuid,
  766. CommitID: commitID,
  767. BranchName: cloudbrain.DefaultBranchName,
  768. Params: Params,
  769. EngineName: image,
  770. DatasetNames: datasetNames,
  771. DatasetInfos: datasetInfos,
  772. IsLatestVersion: modelarts.IsLatestVersion,
  773. VersionCount: modelarts.VersionCountOne,
  774. WorkServerNumber: 1,
  775. Spec: spec,
  776. ModelName: ModelName,
  777. LabelName: evaluationIndex,
  778. CkptName: CkptName,
  779. ModelVersion: ModelVersion,
  780. PreTrainModelUrl: TrainUrl,
  781. }
  782. err = grampus.GenerateTrainJob(ctx, req)
  783. if err != nil {
  784. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  785. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  786. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  787. return
  788. }
  789. }
  790. func createForGrampusNPU(ctx *context.Context, jobName string) {
  791. }
  792. func createForNPU(ctx *context.Context, jobName string) {
  793. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  794. BootFile := ctx.Query("boot_file")
  795. displayJobName := ctx.Query("display_job_name")
  796. description := ctx.Query("description")
  797. srcDataset := ctx.Query("src_dataset") //uuid
  798. combatDataset := ctx.Query("combat_dataset") //uuid
  799. evaluationIndex := ctx.Query("evaluation_index")
  800. Params := ctx.Query("run_para_list")
  801. specId := ctx.QueryInt64("spec_id")
  802. engineID := ctx.QueryInt("engine_id")
  803. log.Info("engine_id=" + fmt.Sprint(engineID))
  804. poolID := ctx.Query("pool_id")
  805. repo := ctx.Repo.Repository
  806. trainUrl := ctx.Query("train_url")
  807. modelName := ctx.Query("model_name")
  808. modelVersion := ctx.Query("model_version")
  809. ckptName := ctx.Query("ckpt_name")
  810. ckptUrl := "/" + trainUrl + ckptName
  811. log.Info("ckpt url:" + ckptUrl)
  812. FlavorName := ctx.Query("flaver_names")
  813. EngineName := ctx.Query("engine_names")
  814. isLatestVersion := modelarts.IsLatestVersion
  815. VersionCount := modelarts.VersionCountOne
  816. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  817. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  818. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  819. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  820. log.Info("ckpt url:" + ckptUrl)
  821. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  822. JobType: models.JobTypeInference,
  823. ComputeResource: models.NPU,
  824. Cluster: models.OpenICluster,
  825. AiCenterCode: models.AICenterOfCloudBrainTwo})
  826. if err != nil || spec == nil {
  827. modelSafetyNewDataPrepare(ctx)
  828. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  829. return
  830. }
  831. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  832. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  833. modelSafetyNewDataPrepare(ctx)
  834. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  835. return
  836. }
  837. //todo: del the codeLocalPath
  838. _, err = ioutil.ReadDir(codeLocalPath)
  839. if err == nil {
  840. os.RemoveAll(codeLocalPath)
  841. }
  842. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  843. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  844. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  845. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  846. modelSafetyNewDataPrepare(ctx)
  847. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  848. return
  849. }
  850. //todo: upload code (send to file_server todo this work?)
  851. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  852. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  853. modelSafetyNewDataPrepare(ctx)
  854. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  855. return
  856. }
  857. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  858. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  859. modelSafetyNewDataPrepare(ctx)
  860. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  861. return
  862. }
  863. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  864. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  865. modelSafetyNewDataPrepare(ctx)
  866. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  867. return
  868. }
  869. var parameters models.Parameters
  870. param := make([]models.Parameter, 0)
  871. param = append(param, models.Parameter{
  872. Label: modelarts.ResultUrl,
  873. Value: "s3:/" + resultObsPath,
  874. }, models.Parameter{
  875. Label: modelarts.CkptUrl,
  876. Value: "s3:/" + ckptUrl,
  877. })
  878. uuid := srcDataset + ";" + combatDataset
  879. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  880. if err != nil {
  881. modelSafetyNewDataPrepare(ctx)
  882. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  883. return
  884. }
  885. dataPath := dataUrl
  886. jsondatas, err := json.Marshal(datasUrlList)
  887. if err != nil {
  888. log.Error("Failed to Marshal: %v", err)
  889. modelSafetyNewDataPrepare(ctx)
  890. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  891. return
  892. }
  893. if isMultiDataset {
  894. param = append(param, models.Parameter{
  895. Label: modelarts.MultiDataUrl,
  896. Value: string(jsondatas),
  897. })
  898. }
  899. existDeviceTarget := false
  900. if len(Params) != 0 {
  901. err := json.Unmarshal([]byte(Params), &parameters)
  902. if err != nil {
  903. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  904. modelSafetyNewDataPrepare(ctx)
  905. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  906. return
  907. }
  908. for _, parameter := range parameters.Parameter {
  909. if parameter.Label == modelarts.DeviceTarget {
  910. existDeviceTarget = true
  911. }
  912. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  913. param = append(param, models.Parameter{
  914. Label: parameter.Label,
  915. Value: parameter.Value,
  916. })
  917. }
  918. }
  919. }
  920. if !existDeviceTarget {
  921. param = append(param, models.Parameter{
  922. Label: modelarts.DeviceTarget,
  923. Value: modelarts.Ascend,
  924. })
  925. }
  926. req := &modelarts.GenerateInferenceJobReq{
  927. JobName: jobName,
  928. DisplayJobName: displayJobName,
  929. DataUrl: dataPath,
  930. Description: description,
  931. CodeObsPath: codeObsPath,
  932. BootFileUrl: codeObsPath + BootFile,
  933. BootFile: BootFile,
  934. TrainUrl: trainUrl,
  935. WorkServerNumber: 1,
  936. EngineID: int64(engineID),
  937. LogUrl: logObsPath,
  938. PoolID: poolID,
  939. Uuid: uuid,
  940. Parameters: param, //modelarts train parameters
  941. CommitID: commitID,
  942. BranchName: cloudbrain.DefaultBranchName,
  943. Params: Params,
  944. FlavorName: FlavorName,
  945. EngineName: EngineName,
  946. LabelName: evaluationIndex,
  947. IsLatestVersion: isLatestVersion,
  948. VersionCount: VersionCount,
  949. TotalVersionCount: modelarts.TotalVersionCount,
  950. ModelName: modelName,
  951. ModelVersion: modelVersion,
  952. CkptName: ckptName,
  953. ResultUrl: resultObsPath,
  954. Spec: spec,
  955. DatasetName: datasetNames,
  956. JobType: string(models.JobTypeModelSafety),
  957. }
  958. err = modelarts.GenerateInferenceJob(ctx, req)
  959. if err != nil {
  960. log.Error("GenerateTrainJob failed:%v", err.Error())
  961. modelSafetyNewDataPrepare(ctx)
  962. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  963. return
  964. }
  965. }
  966. func createForGPU(ctx *context.Context, jobName string) {
  967. BootFile := ctx.Query("boot_file")
  968. displayJobName := ctx.Query("display_job_name")
  969. description := ctx.Query("description")
  970. image := strings.TrimSpace(ctx.Query("image"))
  971. srcDataset := ctx.Query("src_dataset") //uuid
  972. combatDataset := ctx.Query("combat_dataset") //uuid
  973. evaluationIndex := ctx.Query("evaluation_index")
  974. Params := ctx.Query("run_para_list")
  975. specId := ctx.QueryInt64("spec_id")
  976. TrainUrl := ctx.Query("train_url")
  977. CkptName := ctx.Query("ckpt_name")
  978. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  979. log.Info("ckpt url:" + ckptUrl)
  980. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  981. JobType: models.JobTypeBenchmark,
  982. ComputeResource: models.GPU,
  983. Cluster: models.OpenICluster,
  984. AiCenterCode: models.AICenterOfCloudBrainOne})
  985. if err != nil || spec == nil {
  986. modelSafetyNewDataPrepare(ctx)
  987. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  988. return
  989. }
  990. repo := ctx.Repo.Repository
  991. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  992. os.RemoveAll(codePath)
  993. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  994. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  995. modelSafetyNewDataPrepare(ctx)
  996. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  997. return
  998. }
  999. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  1000. if err != nil {
  1001. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  1002. modelSafetyNewDataPrepare(ctx)
  1003. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  1004. return
  1005. }
  1006. uuid := srcDataset + ";" + combatDataset
  1007. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  1008. log.Info("uuid=" + uuid)
  1009. if err != nil {
  1010. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  1011. modelSafetyNewDataPrepare(ctx)
  1012. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  1013. return
  1014. }
  1015. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  1016. if err != nil {
  1017. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  1018. modelSafetyNewDataPrepare(ctx)
  1019. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  1020. return
  1021. }
  1022. log.Info("Command=" + command)
  1023. req := cloudbrain.GenerateCloudBrainTaskReq{
  1024. Ctx: ctx,
  1025. DisplayJobName: displayJobName,
  1026. JobName: jobName,
  1027. Image: image,
  1028. Command: command,
  1029. Uuids: uuid,
  1030. DatasetNames: datasetNames,
  1031. DatasetInfos: datasetInfos,
  1032. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  1033. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  1034. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  1035. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  1036. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  1037. JobType: string(models.JobTypeModelSafety),
  1038. Description: description,
  1039. BranchName: cloudbrain.DefaultBranchName,
  1040. BootFile: BootFile,
  1041. Params: Params,
  1042. CommitID: "",
  1043. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  1044. Spec: spec,
  1045. LabelName: evaluationIndex,
  1046. }
  1047. err = cloudbrain.GenerateTask(req)
  1048. if err != nil {
  1049. modelSafetyNewDataPrepare(ctx)
  1050. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  1051. return
  1052. }
  1053. //ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  1054. }
  1055. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  1056. var command string
  1057. bootFile := strings.TrimSpace(BootFile)
  1058. if !strings.HasSuffix(bootFile, ".py") {
  1059. log.Error("bootFile(%s) format error", bootFile)
  1060. return command, errors.New("bootFile format error")
  1061. }
  1062. var parameters models.Parameters
  1063. var param string
  1064. if len(params) != 0 {
  1065. err := json.Unmarshal([]byte(params), &parameters)
  1066. if err != nil {
  1067. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1068. return command, err
  1069. }
  1070. for _, parameter := range parameters.Parameter {
  1071. param += " --" + parameter.Label + "=" + parameter.Value
  1072. }
  1073. }
  1074. param += " --modelname" + "=" + CkptName
  1075. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  1076. return command, nil
  1077. }
  1078. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  1079. ctx.Data["PageIsCloudBrain"] = true
  1080. ctx.Data["boot_file"] = ctx.Query("boot_file")
  1081. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  1082. ctx.Data["description"] = ctx.Query("description")
  1083. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  1084. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  1085. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  1086. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  1087. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  1088. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  1089. ctx.Data["train_url"] = ctx.Query("train_url")
  1090. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  1091. ctx.Data["train_url"] = ctx.Query("train_url")
  1092. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  1093. ctx.Data["model_name"] = ctx.Query("model_name")
  1094. ctx.Data["model_version"] = ctx.Query("model_version")
  1095. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  1096. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  1097. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  1098. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  1099. prepareCloudbrainOneSpecs(ctx)
  1100. prepareCloudbrainTwoInferenceSpecs(ctx)
  1101. return nil
  1102. }
  // getJsonContent performs an HTTP GET on url and returns the response body
  // as a string.
  //
  // An error is returned when the request fails, when the server answers with
  // a status other than 200 OK, or when the body cannot be read; in all of
  // these cases the returned string is empty. Errors carry the URL and the
  // cause so the caller can log them; nothing is logged here.
  func getJsonContent(url string) (string, error) {
  	resp, err := http.Get(url)
  	if err != nil {
  		return "", fmt.Errorf("get %s: %v", url, err)
  	}
  	// Close the body on every path, including the non-200 one.
  	defer resp.Body.Close()

  	// A non-200 answer arrives with a nil err, so it needs its own error
  	// value instead of dereferencing err.
  	if resp.StatusCode != http.StatusOK {
  		return "", fmt.Errorf("get %s: unexpected status %s", url, resp.Status)
  	}

  	body, err := ioutil.ReadAll(resp.Body)
  	if err != nil {
  		return "", fmt.Errorf("read response from %s: %v", url, err)
  	}
  	return string(body), nil
  }