You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

aisafety.go 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "io"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "strconv"
  11. "strings"
  12. "time"
  13. "code.gitea.io/gitea/models"
  14. "code.gitea.io/gitea/modules/aisafety"
  15. "code.gitea.io/gitea/modules/cloudbrain"
  16. "code.gitea.io/gitea/modules/context"
  17. "code.gitea.io/gitea/modules/git"
  18. "code.gitea.io/gitea/modules/grampus"
  19. "code.gitea.io/gitea/modules/log"
  20. "code.gitea.io/gitea/modules/modelarts"
  21. "code.gitea.io/gitea/modules/setting"
  22. "code.gitea.io/gitea/modules/storage"
  23. "code.gitea.io/gitea/modules/timeutil"
  24. "code.gitea.io/gitea/modules/util"
  25. "code.gitea.io/gitea/services/cloudbrain/resource"
  26. "code.gitea.io/gitea/services/reward/point/account"
  27. uuid "github.com/satori/go.uuid"
  28. )
  29. const (
  30. tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
  31. tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
  32. tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"
  33. tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"
  34. tplModelSafetyTestShow = "repo/modelsafety/show"
  35. )
  36. func CloudBrainAiSafetyCreateTest(ctx *context.Context) {
  37. log.Info("start to create CloudBrainAiSafetyCreate")
  38. uuid := uuid.NewV4()
  39. id := uuid.String()
  40. seriaNoParas := ctx.Query("serialNo")
  41. fileName := ctx.Query("fileName")
  42. //if jobType == string(models.JobTypeBenchmark) {
  43. req := aisafety.TaskReq{
  44. UnionId: id,
  45. EvalName: "test1",
  46. EvalContent: "test1",
  47. TLPath: "test1",
  48. Indicators: []string{"ACC", "ASS"},
  49. CDName: "CIFAR10_1000_FGSM",
  50. BDName: "CIFAR10_1000基础数据集",
  51. }
  52. aisafety.GetAlgorithmList()
  53. if seriaNoParas != "" {
  54. aisafety.GetTaskStatus(seriaNoParas)
  55. } else {
  56. jsonStr, err := getJsonContent("http://192.168.207.34:8065/Test_zap1234/openi_aisafety/raw/branch/master/result/" + fileName)
  57. serialNo, err := aisafety.CreateSafetyTask(req, jsonStr)
  58. if err == nil {
  59. log.Info("serialNo=" + serialNo)
  60. time.Sleep(time.Duration(2) * time.Second)
  61. aisafety.GetTaskStatus(serialNo)
  62. } else {
  63. log.Info("CreateSafetyTask error," + err.Error())
  64. }
  65. }
  66. }
  67. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  68. if job == nil {
  69. log.Error("GetCloudbrainByJobID failed")
  70. return
  71. }
  72. syncAiSafetyTaskStatus(job)
  73. }
  74. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  75. ctx.Data["id"] = ctx.Params(":jobid")
  76. ctx.HTML(200, tplModelSafetyTestShow)
  77. }
  78. func GetAiSafetyTask(ctx *context.Context) {
  79. var ID = ctx.Params(":jobid")
  80. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  81. if err != nil {
  82. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  83. return
  84. }
  85. syncAiSafetyTaskStatus(job)
  86. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  87. job.BenchmarkType = "安全评测"
  88. job.BenchmarkTypeName = "Image Classification"
  89. ctx.JSON(200, job)
  90. }
  91. func StopAiSafetyTask(ctx *context.Context) {
  92. var ID = ctx.Params(":jobid")
  93. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  94. result := make(map[string]interface{})
  95. result["code"] = -1
  96. if err != nil {
  97. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  98. result["msg"] = "No such task."
  99. ctx.JSON(200, result)
  100. return
  101. }
  102. if isTaskNotFinished(task.Status) {
  103. if task.Type == models.TypeCloudBrainTwo {
  104. //queryTaskStatusFromCloudbrainTwo(job)
  105. } else if task.Type == models.TypeCloudBrainOne {
  106. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  107. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  108. result["msg"] = "cloudbrain.Already_stopped"
  109. ctx.JSON(200, result)
  110. return
  111. }
  112. err := cloudbrain.StopJob(task.JobID)
  113. if err != nil {
  114. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  115. result["msg"] = "cloudbrain.Stopped_failed"
  116. ctx.JSON(200, result)
  117. return
  118. }
  119. task.Status = string(models.JobStopped)
  120. if task.EndTime == 0 {
  121. task.EndTime = timeutil.TimeStampNow()
  122. }
  123. task.ComputeAndSetDuration()
  124. err = models.UpdateJob(task)
  125. if err != nil {
  126. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  127. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  128. ctx.JSON(200, result)
  129. return
  130. }
  131. }
  132. } else {
  133. if task.Status == string(models.ModelSafetyTesting) {
  134. //修改为Failed
  135. task.Status = string(models.JobStopped)
  136. if task.EndTime == 0 {
  137. task.EndTime = timeutil.TimeStampNow()
  138. }
  139. task.ComputeAndSetDuration()
  140. err = models.UpdateJob(task)
  141. if err != nil {
  142. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  143. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  144. ctx.JSON(200, result)
  145. return
  146. }
  147. } else {
  148. log.Info("The job is finished. status=" + task.Status)
  149. }
  150. }
  151. }
  152. func DelAiSafetyTask(ctx *context.Context) {
  153. var ID = ctx.Params(":jobid")
  154. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  155. result := make(map[string]interface{})
  156. result["code"] = 1
  157. if err != nil {
  158. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  159. result["msg"] = "No such task."
  160. ctx.JSON(200, result)
  161. return
  162. }
  163. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  164. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  165. result["msg"] = "the job(" + task.JobName + ") has not been stopped"
  166. ctx.JSON(200, result)
  167. return
  168. }
  169. if task.Type == models.TypeCloudBrainOne {
  170. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  171. }
  172. err = models.DeleteJob(task)
  173. if err != nil {
  174. result["msg"] = err.Error()
  175. ctx.JSON(200, result)
  176. return
  177. }
  178. result["code"] = 0
  179. result["msg"] = "Succeed"
  180. ctx.JSON(200, result)
  181. }
  182. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  183. if isTaskNotFinished(job.Status) {
  184. if job.Type == models.TypeCloudBrainTwo {
  185. queryTaskStatusFromCloudbrainTwo(job)
  186. } else if job.Type == models.TypeCloudBrainOne {
  187. queryTaskStatusFromCloudbrain(job)
  188. } else if job.Type == models.TypeC2Net {
  189. queryTaskStatusFromGrampus(job)
  190. }
  191. } else {
  192. if job.Status == string(models.ModelSafetyTesting) {
  193. queryTaskStatusFromModelSafetyTestServer(job)
  194. } else {
  195. log.Info("The job is finished. status=" + job.Status)
  196. }
  197. }
  198. }
  199. func TimerHandleModelSafetyTestTask() {
  200. tasks, err := models.GetModelSafetyTestTask()
  201. if err == nil {
  202. if tasks != nil && len(tasks) > 0 {
  203. for _, job := range tasks {
  204. syncAiSafetyTaskStatus(job)
  205. }
  206. } else {
  207. log.Info("query running model safety test task 0.")
  208. }
  209. } else {
  210. log.Info("query running model safety test task err." + err.Error())
  211. }
  212. }
  213. func queryTaskStatusFromGrampus(task *models.Cloudbrain) {
  214. log.Info("The task not finished,name=" + task.DisplayJobName)
  215. if task.DeletedAt.IsZero() { //normal record
  216. result, err := grampus.GetJob(task.JobID)
  217. resultJson, _ := json.Marshal(result)
  218. log.Info("resultJson=" + string(resultJson))
  219. if err != nil {
  220. log.Error("GetJob failed:" + err.Error())
  221. return
  222. }
  223. if result != nil {
  224. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  225. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  226. }
  227. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  228. if task.Status != models.GrampusStatusSucceeded {
  229. if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning {
  230. task.Duration = result.JobInfo.RunSec
  231. if task.Duration < 0 {
  232. task.Duration = 0
  233. }
  234. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  235. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  236. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  237. }
  238. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  239. task.EndTime = task.StartTime.Add(task.Duration)
  240. }
  241. task.CorrectCreateUnix()
  242. err = models.UpdateJob(task)
  243. if err != nil {
  244. log.Error("UpdateJob failed:" + err.Error())
  245. }
  246. }
  247. } else {
  248. task.Status = string(models.ModelSafetyTesting)
  249. err = models.UpdateJob(task)
  250. if err != nil {
  251. log.Error("UpdateJob failed:", err)
  252. }
  253. //send msg to beihang
  254. sendGPUInferenceResultToTest(task)
  255. }
  256. }
  257. }
  258. }
  259. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  260. log.Info("The task not finished,name=" + job.DisplayJobName)
  261. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  262. if err != nil {
  263. log.Info("query train job error." + err.Error())
  264. return
  265. }
  266. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  267. job.Duration = result.Duration
  268. job.TrainJobDuration = result.TrainJobDuration
  269. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  270. err = models.UpdateJob(job)
  271. if err != nil {
  272. log.Error("UpdateJob failed:", err)
  273. }
  274. } else {
  275. job.Status = string(models.ModelSafetyTesting)
  276. err = models.UpdateJob(job)
  277. if err != nil {
  278. log.Error("UpdateJob failed:", err)
  279. }
  280. //send msg to beihang
  281. sendNPUInferenceResultToTest(job)
  282. }
  283. }
  284. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  285. datasetname := job.DatasetName
  286. datasetnames := strings.Split(datasetname, ";")
  287. indicator := job.LabelName
  288. req := aisafety.TaskReq{
  289. UnionId: job.JobID,
  290. EvalName: job.DisplayJobName,
  291. EvalContent: job.Description,
  292. TLPath: "test",
  293. Indicators: strings.Split(indicator, ";"),
  294. CDName: datasetnames[1],
  295. BDName: datasetnames[0],
  296. }
  297. jsonContent := ""
  298. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  299. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  300. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  301. if err != nil {
  302. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  303. } else {
  304. defer body.Close()
  305. var data []byte
  306. p := make([]byte, 4096)
  307. var readErr error
  308. var readCount int
  309. for {
  310. readCount, readErr = body.Read(p)
  311. if readCount > 0 {
  312. data = append(data, p[:readCount]...)
  313. }
  314. if readErr != nil || readCount == 0 {
  315. break
  316. }
  317. }
  318. jsonContent = string(data)
  319. }
  320. if jsonContent != "" {
  321. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  322. if err == nil {
  323. //update serial no to db
  324. job.PreVersionName = serialNo
  325. err = models.UpdateJob(job)
  326. if err != nil {
  327. log.Error("UpdateJob failed:", err)
  328. }
  329. }
  330. } else {
  331. log.Info("The json is null. so set it failed.")
  332. //update task failed.
  333. job.Status = string(models.ModelArtsTrainJobFailed)
  334. err := models.UpdateJob(job)
  335. if err != nil {
  336. log.Error("UpdateJob failed:", err)
  337. }
  338. }
  339. }
  340. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  341. log.Info("The task not finished,name=" + job.DisplayJobName)
  342. jobResult, err := cloudbrain.GetJob(job.JobID)
  343. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  344. if err != nil {
  345. log.Error("ConvertToJobResultPayload failed:", err)
  346. return
  347. }
  348. job.Status = result.JobStatus.State
  349. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  350. taskRoles := result.TaskRoles
  351. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  352. job.Status = taskRes.TaskStatuses[0].State
  353. }
  354. if result.JobStatus.State != string(models.JobSucceeded) {
  355. err = models.UpdateJob(job)
  356. if err != nil {
  357. log.Error("UpdateJob failed:", err)
  358. }
  359. } else {
  360. //
  361. job.Status = string(models.ModelSafetyTesting)
  362. err = models.UpdateJob(job)
  363. if err != nil {
  364. log.Error("UpdateJob failed:", err)
  365. }
  366. //send msg to beihang
  367. sendGPUInferenceResultToTest(job)
  368. }
  369. }
  370. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  371. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  372. if err == nil {
  373. if result.Code == "0" {
  374. if result.Data.Status == 1 {
  375. log.Info("The task is running....")
  376. } else {
  377. if result.Data.Code == 0 {
  378. job.ResultJson = result.Data.StandardJson
  379. err = models.UpdateJob(job)
  380. if err != nil {
  381. log.Error("UpdateJob failed:", err)
  382. }
  383. }
  384. }
  385. } else {
  386. log.Info("The task is failed.")
  387. job.Status = string(models.JobFailed)
  388. err = models.UpdateJob(job)
  389. if err != nil {
  390. log.Error("UpdateJob failed:", err)
  391. }
  392. }
  393. } else {
  394. log.Info("The task not found.....")
  395. }
  396. }
  397. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  398. datasetname := job.DatasetName
  399. datasetnames := strings.Split(datasetname, ";")
  400. indicator := job.LabelName
  401. req := aisafety.TaskReq{
  402. UnionId: job.JobID,
  403. EvalName: job.DisplayJobName,
  404. EvalContent: job.Description,
  405. TLPath: "test",
  406. Indicators: strings.Split(indicator, ";"),
  407. CDName: datasetnames[1],
  408. BDName: datasetnames[0],
  409. }
  410. resultDir := "/model"
  411. prefix := "/" + setting.CBCodePathPrefix + job.JobName + resultDir
  412. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  413. if err != nil {
  414. log.Error("query cloudbrain one model failed: %v", err)
  415. return
  416. }
  417. jsonContent := ""
  418. for _, file := range files {
  419. if strings.HasSuffix(file.FileName, "result.json") {
  420. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  421. log.Info("path=" + path)
  422. reader, err := os.Open(path)
  423. defer reader.Close()
  424. if err == nil {
  425. r := bufio.NewReader(reader)
  426. for {
  427. line, error := r.ReadString('\n')
  428. if error == io.EOF {
  429. log.Info("read file completed.")
  430. break
  431. }
  432. if error != nil {
  433. log.Info("read file error." + error.Error())
  434. break
  435. }
  436. jsonContent += line
  437. }
  438. }
  439. break
  440. }
  441. }
  442. if jsonContent != "" {
  443. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  444. if err == nil {
  445. //update serial no to db
  446. job.PreVersionName = serialNo
  447. err = models.UpdateJob(job)
  448. if err != nil {
  449. log.Error("UpdateJob failed:", err)
  450. }
  451. }
  452. } else {
  453. log.Info("The json is null. so set it failed.")
  454. //update task failed.
  455. job.Status = string(models.JobFailed)
  456. err = models.UpdateJob(job)
  457. if err != nil {
  458. log.Error("UpdateJob failed:", err)
  459. }
  460. }
  461. }
  462. func isTaskNotFinished(status string) bool {
  463. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  464. return true
  465. }
  466. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  467. return true
  468. }
  469. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  470. return true
  471. }
  472. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  473. return true
  474. }
  475. return false
  476. }
  477. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  478. t := time.Now()
  479. ctx.Data["PageIsCloudBrain"] = true
  480. ctx.Data["IsCreate"] = true
  481. ctx.Data["type"] = models.TypeCloudBrainOne
  482. ctx.Data["compute_resource"] = models.GPUResource
  483. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  484. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  485. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  486. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  487. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  488. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  489. ctx.Data["display_job_name"] = displayJobName
  490. prepareCloudbrainOneSpecs(ctx)
  491. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  492. if queuesDetail != nil {
  493. ctx.Data["QueuesDetail"] = queuesDetail
  494. }
  495. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  496. }
  497. func AiSafetyCreateForGetGrampusGPU(ctx *context.Context) {
  498. ctx.Data["PageIsCloudBrain"] = true
  499. ctx.Data["IsCreate"] = true
  500. ctx.Data["type"] = models.TypeC2Net
  501. ctx.Data["compute_resource"] = models.GPUResource
  502. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  503. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  504. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  505. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  506. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  507. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  508. if err != nil {
  509. ctx.ServerError("get new train-job info failed", err)
  510. return
  511. }
  512. ctx.HTML(200, tplModelSafetyTestCreateGrampusGpu)
  513. }
  514. func AiSafetyCreateForGetGrampusNPU(ctx *context.Context) {
  515. ctx.Data["PageIsCloudBrain"] = true
  516. ctx.Data["IsCreate"] = true
  517. ctx.Data["type"] = models.TypeC2Net
  518. ctx.Data["compute_resource"] = models.NPUResource
  519. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  520. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  521. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  522. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  523. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  524. err := GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  525. if err != nil {
  526. ctx.ServerError("get new train-job info failed", err)
  527. return
  528. }
  529. ctx.HTML(200, tplModelSafetyTestCreateGrampusNpu)
  530. }
  531. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  532. t := time.Now()
  533. ctx.Data["PageIsCloudBrain"] = true
  534. ctx.Data["IsCreate"] = true
  535. ctx.Data["type"] = models.TypeCloudBrainTwo
  536. ctx.Data["compute_resource"] = models.NPUResource
  537. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  538. ctx.Data["display_job_name"] = displayJobName
  539. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  540. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.BaseDataSetName
  541. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.BaseDataSetUUID
  542. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.CombatDataSetName
  543. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.CombatDataSetUUID
  544. var resourcePools modelarts.ResourcePool
  545. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  546. ctx.ServerError("json.Unmarshal failed:", err)
  547. }
  548. ctx.Data["resource_pools"] = resourcePools.Info
  549. var engines modelarts.Engine
  550. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  551. ctx.ServerError("json.Unmarshal failed:", err)
  552. }
  553. ctx.Data["engines"] = engines.Info
  554. var versionInfos modelarts.VersionInfo
  555. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  556. ctx.ServerError("json.Unmarshal failed:", err)
  557. }
  558. ctx.Data["engine_versions"] = versionInfos.Version
  559. prepareCloudbrainTwoInferenceSpecs(ctx)
  560. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  561. ctx.Data["WaitCount"] = waitCount
  562. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  563. }
  564. func AiSafetyCreateForPost(ctx *context.Context) {
  565. ctx.Data["PageIsCloudBrain"] = true
  566. displayJobName := ctx.Query("display_job_name")
  567. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  568. taskType := ctx.QueryInt("type")
  569. description := ctx.Query("description")
  570. ctx.Data["description"] = description
  571. repo := ctx.Repo.Repository
  572. tpname := tplCloudBrainModelSafetyNewNpu
  573. if taskType == models.TypeCloudBrainOne {
  574. tpname = tplCloudBrainModelSafetyNewGpu
  575. }
  576. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  577. if err == nil {
  578. if len(tasks) != 0 {
  579. log.Error("the job name did already exist", ctx.Data["MsgID"])
  580. modelSafetyNewDataPrepare(ctx)
  581. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  582. return
  583. }
  584. } else {
  585. if !models.IsErrJobNotExist(err) {
  586. log.Error("system error, %v", err, ctx.Data["MsgID"])
  587. modelSafetyNewDataPrepare(ctx)
  588. ctx.RenderWithErr("system error", tpname, nil)
  589. return
  590. }
  591. }
  592. if !jobNamePattern.MatchString(jobName) {
  593. modelSafetyNewDataPrepare(ctx)
  594. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  595. return
  596. }
  597. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  598. if err != nil {
  599. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  600. modelSafetyNewDataPrepare(ctx)
  601. ctx.RenderWithErr("system error", tpname, nil)
  602. return
  603. } else {
  604. if count >= 1 {
  605. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  606. modelSafetyNewDataPrepare(ctx)
  607. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  608. return
  609. }
  610. }
  611. BootFile := ctx.Query("boot_file")
  612. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  613. if err != nil || !bootFileExist {
  614. log.Error("Get bootfile error:", err)
  615. modelSafetyNewDataPrepare(ctx)
  616. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  617. return
  618. }
  619. if taskType == models.TypeCloudBrainTwo {
  620. createForNPU(ctx, jobName)
  621. } else if taskType == models.TypeCloudBrainOne {
  622. createForGPU(ctx, jobName)
  623. } else if taskType == models.TypeC2Net {
  624. ComputeResource := ctx.Query("compute_resource")
  625. if ComputeResource == models.NPUResource {
  626. createForGrampusNPU(ctx, jobName)
  627. } else if ComputeResource == models.GPUResource {
  628. createForGrampusGPU(ctx, jobName)
  629. }
  630. }
  631. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  632. }
  633. func createForGrampusGPU(ctx *context.Context, jobName string) {
  634. BootFile := ctx.Query("boot_file")
  635. displayJobName := ctx.Query("display_job_name")
  636. description := ctx.Query("description")
  637. image := strings.TrimSpace(ctx.Query("image"))
  638. srcDataset := ctx.Query("src_dataset") //uuid
  639. combatDataset := ctx.Query("combat_dataset") //uuid
  640. evaluationIndex := ctx.Query("evaluation_index")
  641. Params := ctx.Query("run_para_list")
  642. specId := ctx.QueryInt64("spec_id")
  643. TrainUrl := ctx.Query("train_url")
  644. CkptName := ctx.Query("ckpt_name")
  645. ModelName := ctx.Query("ModelName")
  646. ModelVersion := ctx.Query("ModelVersion")
  647. repo := ctx.Repo.Repository
  648. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  649. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  650. //check specification
  651. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  652. JobType: models.JobTypeTrain,
  653. ComputeResource: models.GPU,
  654. Cluster: models.C2NetCluster,
  655. })
  656. if err != nil || spec == nil {
  657. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  658. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  659. return
  660. }
  661. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  662. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  663. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  664. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  665. return
  666. }
  667. //check dataset
  668. uuid := srcDataset + ";" + combatDataset
  669. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  670. if err != nil {
  671. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  672. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  673. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  674. return
  675. }
  676. //prepare code and out path
  677. _, err = ioutil.ReadDir(codeLocalPath)
  678. if err == nil {
  679. os.RemoveAll(codeLocalPath)
  680. }
  681. if err := downloadZipCode(ctx, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  682. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  683. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  684. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  685. return
  686. }
  687. //todo: upload code (send to file_server todo this work?)
  688. //upload code
  689. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  690. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  691. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  692. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  693. return
  694. }
  695. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  696. if err := mkModelPath(modelPath); err != nil {
  697. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  698. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  699. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  700. return
  701. }
  702. //init model readme
  703. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  704. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  705. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  706. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  707. return
  708. }
  709. var datasetRemotePath, allFileName string
  710. for _, datasetInfo := range datasetInfos {
  711. if datasetRemotePath == "" {
  712. datasetRemotePath = datasetInfo.DataLocalPath
  713. allFileName = datasetInfo.FullName
  714. } else {
  715. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  716. allFileName = allFileName + ";" + datasetInfo.FullName
  717. }
  718. }
  719. //prepare command
  720. preTrainModelPath := getPreTrainModelPath(TrainUrl, CkptName)
  721. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, BootFile, Params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, CkptName)
  722. if err != nil {
  723. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  724. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  725. ctx.RenderWithErr("Create task failed, internal error", tplCloudBrainModelSafetyNewGrampusGpu, nil)
  726. return
  727. }
  728. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  729. req := &grampus.GenerateTrainJobReq{
  730. JobName: jobName,
  731. DisplayJobName: displayJobName,
  732. ComputeResource: models.GPUResource,
  733. ProcessType: grampus.ProcessorTypeGPU,
  734. Command: command,
  735. ImageUrl: image,
  736. Description: description,
  737. BootFile: BootFile,
  738. Uuid: uuid,
  739. CommitID: commitID,
  740. BranchName: cloudbrain.DefaultBranchName,
  741. Params: Params,
  742. EngineName: image,
  743. DatasetNames: datasetNames,
  744. DatasetInfos: datasetInfos,
  745. IsLatestVersion: modelarts.IsLatestVersion,
  746. VersionCount: modelarts.VersionCountOne,
  747. WorkServerNumber: 1,
  748. Spec: spec,
  749. ModelName: ModelName,
  750. LabelName: evaluationIndex,
  751. CkptName: CkptName,
  752. ModelVersion: ModelVersion,
  753. PreTrainModelUrl: TrainUrl,
  754. }
  755. err = grampus.GenerateTrainJob(ctx, req)
  756. if err != nil {
  757. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  758. GrampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  759. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewGrampusGpu, nil)
  760. return
  761. }
  762. }
  763. func createForGrampusNPU(ctx *context.Context, jobName string) {
  764. }
  765. func createForNPU(ctx *context.Context, jobName string) {
  766. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  767. BootFile := ctx.Query("boot_file")
  768. displayJobName := ctx.Query("display_job_name")
  769. description := ctx.Query("description")
  770. srcDataset := ctx.Query("src_dataset") //uuid
  771. combatDataset := ctx.Query("combat_dataset") //uuid
  772. evaluationIndex := ctx.Query("evaluation_index")
  773. Params := ctx.Query("run_para_list")
  774. specId := ctx.QueryInt64("spec_id")
  775. engineID := ctx.QueryInt("EngineID")
  776. poolID := ctx.Query("PoolID")
  777. repo := ctx.Repo.Repository
  778. trainUrl := ctx.Query("train_url")
  779. modelName := ctx.Query("ModelName")
  780. modelVersion := ctx.Query("ModelVersion")
  781. ckptName := ctx.Query("ckpt_name")
  782. ckptUrl := "/" + trainUrl + ckptName
  783. log.Info("ckpt url:" + ckptUrl)
  784. FlavorName := ctx.Query("FlavorName")
  785. EngineName := ctx.Query("EngineName")
  786. isLatestVersion := modelarts.IsLatestVersion
  787. VersionCount := modelarts.VersionCountOne
  788. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  789. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  790. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  791. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  792. log.Info("ckpt url:" + ckptUrl)
  793. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  794. JobType: models.JobTypeInference,
  795. ComputeResource: models.NPU,
  796. Cluster: models.OpenICluster,
  797. AiCenterCode: models.AICenterOfCloudBrainTwo})
  798. if err != nil || spec == nil {
  799. modelSafetyNewDataPrepare(ctx)
  800. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  801. return
  802. }
  803. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  804. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  805. modelSafetyNewDataPrepare(ctx)
  806. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplCloudBrainModelSafetyNewNpu, nil)
  807. return
  808. }
  809. //todo: del the codeLocalPath
  810. _, err = ioutil.ReadDir(codeLocalPath)
  811. if err == nil {
  812. os.RemoveAll(codeLocalPath)
  813. }
  814. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  815. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  816. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  817. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  818. modelSafetyNewDataPrepare(ctx)
  819. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  820. return
  821. }
  822. //todo: upload code (send to file_server todo this work?)
  823. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  824. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  825. modelSafetyNewDataPrepare(ctx)
  826. ctx.RenderWithErr("Failed to obsMkdir_result", tplCloudBrainModelSafetyNewNpu, nil)
  827. return
  828. }
  829. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  830. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  831. modelSafetyNewDataPrepare(ctx)
  832. ctx.RenderWithErr("Failed to obsMkdir_log", tplCloudBrainModelSafetyNewNpu, nil)
  833. return
  834. }
  835. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  836. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  837. modelSafetyNewDataPrepare(ctx)
  838. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplCloudBrainModelSafetyNewNpu, nil)
  839. return
  840. }
  841. var parameters models.Parameters
  842. param := make([]models.Parameter, 0)
  843. param = append(param, models.Parameter{
  844. Label: modelarts.ResultUrl,
  845. Value: "s3:/" + resultObsPath,
  846. }, models.Parameter{
  847. Label: modelarts.CkptUrl,
  848. Value: "s3:/" + ckptUrl,
  849. })
  850. uuid := srcDataset + ";" + combatDataset
  851. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  852. if err != nil {
  853. modelSafetyNewDataPrepare(ctx)
  854. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  855. return
  856. }
  857. dataPath := dataUrl
  858. jsondatas, err := json.Marshal(datasUrlList)
  859. if err != nil {
  860. log.Error("Failed to Marshal: %v", err)
  861. modelSafetyNewDataPrepare(ctx)
  862. ctx.RenderWithErr("json error:"+err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  863. return
  864. }
  865. if isMultiDataset {
  866. param = append(param, models.Parameter{
  867. Label: modelarts.MultiDataUrl,
  868. Value: string(jsondatas),
  869. })
  870. }
  871. existDeviceTarget := false
  872. if len(Params) != 0 {
  873. err := json.Unmarshal([]byte(Params), &parameters)
  874. if err != nil {
  875. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  876. modelSafetyNewDataPrepare(ctx)
  877. ctx.RenderWithErr("运行参数错误", tplCloudBrainModelSafetyNewNpu, nil)
  878. return
  879. }
  880. for _, parameter := range parameters.Parameter {
  881. if parameter.Label == modelarts.DeviceTarget {
  882. existDeviceTarget = true
  883. }
  884. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  885. param = append(param, models.Parameter{
  886. Label: parameter.Label,
  887. Value: parameter.Value,
  888. })
  889. }
  890. }
  891. }
  892. if !existDeviceTarget {
  893. param = append(param, models.Parameter{
  894. Label: modelarts.DeviceTarget,
  895. Value: modelarts.Ascend,
  896. })
  897. }
  898. req := &modelarts.GenerateInferenceJobReq{
  899. JobName: jobName,
  900. DisplayJobName: displayJobName,
  901. DataUrl: dataPath,
  902. Description: description,
  903. CodeObsPath: codeObsPath,
  904. BootFileUrl: codeObsPath + BootFile,
  905. BootFile: BootFile,
  906. TrainUrl: trainUrl,
  907. WorkServerNumber: 1,
  908. EngineID: int64(engineID),
  909. LogUrl: logObsPath,
  910. PoolID: poolID,
  911. Uuid: uuid,
  912. Parameters: param, //modelarts train parameters
  913. CommitID: commitID,
  914. BranchName: cloudbrain.DefaultBranchName,
  915. Params: Params,
  916. FlavorName: FlavorName,
  917. EngineName: EngineName,
  918. LabelName: evaluationIndex,
  919. IsLatestVersion: isLatestVersion,
  920. VersionCount: VersionCount,
  921. TotalVersionCount: modelarts.TotalVersionCount,
  922. ModelName: modelName,
  923. ModelVersion: modelVersion,
  924. CkptName: ckptName,
  925. ResultUrl: resultObsPath,
  926. Spec: spec,
  927. DatasetName: datasetNames,
  928. JobType: string(models.JobTypeModelSafety),
  929. }
  930. err = modelarts.GenerateInferenceJob(ctx, req)
  931. if err != nil {
  932. log.Error("GenerateTrainJob failed:%v", err.Error())
  933. modelSafetyNewDataPrepare(ctx)
  934. ctx.RenderWithErr(err.Error(), tplCloudBrainModelSafetyNewNpu, nil)
  935. return
  936. }
  937. }
  938. func createForGPU(ctx *context.Context, jobName string) {
  939. BootFile := ctx.Query("boot_file")
  940. displayJobName := ctx.Query("display_job_name")
  941. description := ctx.Query("description")
  942. image := strings.TrimSpace(ctx.Query("image"))
  943. srcDataset := ctx.Query("src_dataset") //uuid
  944. combatDataset := ctx.Query("combat_dataset") //uuid
  945. evaluationIndex := ctx.Query("evaluation_index")
  946. Params := ctx.Query("run_para_list")
  947. specId := ctx.QueryInt64("spec_id")
  948. TrainUrl := ctx.Query("train_url")
  949. CkptName := ctx.Query("ckpt_name")
  950. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  951. log.Info("ckpt url:" + ckptUrl)
  952. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  953. JobType: models.JobTypeBenchmark,
  954. ComputeResource: models.GPU,
  955. Cluster: models.OpenICluster,
  956. AiCenterCode: models.AICenterOfCloudBrainOne})
  957. if err != nil || spec == nil {
  958. modelSafetyNewDataPrepare(ctx)
  959. ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewGpu, nil)
  960. return
  961. }
  962. repo := ctx.Repo.Repository
  963. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  964. os.RemoveAll(codePath)
  965. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  966. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  967. modelSafetyNewDataPrepare(ctx)
  968. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  969. return
  970. }
  971. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  972. if err != nil {
  973. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  974. modelSafetyNewDataPrepare(ctx)
  975. ctx.RenderWithErr("system error", tplCloudBrainModelSafetyNewGpu, nil)
  976. return
  977. }
  978. uuid := srcDataset + ";" + combatDataset
  979. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  980. log.Info("uuid=" + uuid)
  981. if err != nil {
  982. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  983. modelSafetyNewDataPrepare(ctx)
  984. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil)
  985. return
  986. }
  987. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  988. if err != nil {
  989. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  990. modelSafetyNewDataPrepare(ctx)
  991. //ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tplCloudBrainBenchmarkNew, nil) TODO
  992. return
  993. }
  994. log.Info("Command=" + command)
  995. req := cloudbrain.GenerateCloudBrainTaskReq{
  996. Ctx: ctx,
  997. DisplayJobName: displayJobName,
  998. JobName: jobName,
  999. Image: image,
  1000. Command: command,
  1001. Uuids: uuid,
  1002. DatasetNames: datasetNames,
  1003. DatasetInfos: datasetInfos,
  1004. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  1005. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  1006. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  1007. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  1008. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  1009. JobType: string(models.JobTypeModelSafety),
  1010. Description: description,
  1011. BranchName: cloudbrain.DefaultBranchName,
  1012. BootFile: BootFile,
  1013. Params: Params,
  1014. CommitID: "",
  1015. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  1016. Spec: spec,
  1017. LabelName: evaluationIndex,
  1018. }
  1019. err = cloudbrain.GenerateTask(req)
  1020. if err != nil {
  1021. modelSafetyNewDataPrepare(ctx)
  1022. ctx.RenderWithErr(err.Error(), tplCloudBrainBenchmarkNew, nil)
  1023. return
  1024. }
  1025. //ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/modelsafety_test")
  1026. }
  1027. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  1028. var command string
  1029. bootFile := strings.TrimSpace(BootFile)
  1030. if !strings.HasSuffix(bootFile, ".py") {
  1031. log.Error("bootFile(%s) format error", bootFile)
  1032. return command, errors.New("bootFile format error")
  1033. }
  1034. var parameters models.Parameters
  1035. var param string
  1036. if len(params) != 0 {
  1037. err := json.Unmarshal([]byte(params), &parameters)
  1038. if err != nil {
  1039. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  1040. return command, err
  1041. }
  1042. for _, parameter := range parameters.Parameter {
  1043. param += " --" + parameter.Label + "=" + parameter.Value
  1044. }
  1045. }
  1046. param += " --modelname" + "=" + CkptName
  1047. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  1048. return command, nil
  1049. }
  1050. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  1051. ctx.Data["PageIsCloudBrain"] = true
  1052. ctx.Data["boot_file"] = ctx.Query("boot_file")
  1053. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  1054. ctx.Data["description"] = ctx.Query("description")
  1055. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  1056. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  1057. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  1058. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  1059. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  1060. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  1061. ctx.Data["train_url"] = ctx.Query("train_url")
  1062. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  1063. prepareCloudbrainOneSpecs(ctx)
  1064. return nil
  1065. }
  1066. func getJsonContent(url string) (string, error) {
  1067. resp, err := http.Get(url)
  1068. if err != nil || resp.StatusCode != 200 {
  1069. log.Info("Get organizations url error=" + err.Error())
  1070. return "", err
  1071. }
  1072. bytes, err := ioutil.ReadAll(resp.Body)
  1073. resp.Body.Close()
  1074. if err != nil {
  1075. log.Info("Get organizations url error=" + err.Error())
  1076. return "", err
  1077. }
  1078. str := string(bytes)
  1079. //log.Info("json str =" + str)
  1080. return str, nil
  1081. }