You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 15 kB

5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
5 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
5 years ago
3 years ago
5 years ago
3 years ago
5 years ago
5 years ago
3 years ago
5 years ago
5 years ago
3 years ago
5 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
5 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525
  1. package cloudbrain
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "strconv"
  6. "code.gitea.io/gitea/modules/timeutil"
  7. "code.gitea.io/gitea/modules/storage"
  8. "code.gitea.io/gitea/models"
  9. "code.gitea.io/gitea/modules/context"
  10. "code.gitea.io/gitea/modules/log"
  11. "code.gitea.io/gitea/modules/notification"
  12. "code.gitea.io/gitea/modules/setting"
  13. )
  14. const (
  15. Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"`
  16. //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"`
  17. CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"`
  18. CodeMountPath = "/code"
  19. DataSetMountPath = "/dataset"
  20. ModelMountPath = "/model"
  21. LogFile = "log.txt"
  22. BenchMarkMountPath = "/benchmark"
  23. BenchMarkResourceID = 1
  24. Snn4imagenetMountPath = "/snn4imagenet"
  25. BrainScoreMountPath = "/brainscore"
  26. TaskInfoName = "/taskInfo"
  27. Snn4imagenetCommand = `/opt/conda/bin/python /snn4imagenet/testSNN_script.py --modelname '%s' --modelpath '/dataset' --modeldescription '%s'`
  28. BrainScoreCommand = `bash /brainscore/brainscore_test_par4shSrcipt.sh -b '%s' -n '%s' -p '/dataset' -d '%s'`
  29. SubTaskName = "task1"
  30. Success = "S000"
  31. DefaultBranchName = "master"
  32. )
  33. var (
  34. ResourceSpecs *models.ResourceSpecs
  35. TrainResourceSpecs *models.ResourceSpecs
  36. )
  37. type GenerateCloudBrainTaskReq struct {
  38. Ctx *context.Context
  39. DisplayJobName string
  40. JobName string
  41. Image string
  42. Command string
  43. Uuids string
  44. CodePath string
  45. ModelPath string
  46. BenchmarkPath string
  47. Snn4ImageNetPath string
  48. BrainScorePath string
  49. JobType string
  50. GpuQueue string
  51. Description string
  52. BranchName string
  53. BootFile string
  54. Params string
  55. CommitID string
  56. DataLocalPath string
  57. BenchmarkTypeID int
  58. BenchmarkChildTypeID int
  59. ResourceSpecId int
  60. }
  61. func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  62. if !ctx.IsSigned {
  63. return false
  64. }
  65. log.Info("is repo owner:" + strconv.FormatBool(ctx.IsUserRepoOwner()))
  66. log.Info("is user admin:" + strconv.FormatBool(ctx.IsUserSiteAdmin()))
  67. if err != nil {
  68. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
  69. } else {
  70. log.Info("is job creator:" + strconv.FormatBool(ctx.User.ID == job.UserID))
  71. return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  72. }
  73. }
  74. func CanDeleteJob(ctx *context.Context, job *models.Cloudbrain) bool {
  75. return isAdminOrOwnerOrJobCreater(ctx, job, nil)
  76. }
  77. func CanCreateOrDebugJob(ctx *context.Context) bool {
  78. if !ctx.IsSigned {
  79. return false
  80. }
  81. return ctx.Repo.CanWrite(models.UnitTypeCloudBrain)
  82. }
  83. func CanModifyJob(ctx *context.Context, job *models.Cloudbrain) bool {
  84. return isAdminOrJobCreater(ctx, job, nil)
  85. }
  86. func isAdminOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool {
  87. if !ctx.IsSigned {
  88. return false
  89. }
  90. if err != nil {
  91. return ctx.IsUserSiteAdmin()
  92. } else {
  93. return ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
  94. }
  95. }
  96. func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) bool {
  97. if !ctx.IsSigned {
  98. return false
  99. }
  100. if err != nil {
  101. return ctx.IsUserSiteAdmin()
  102. } else {
  103. return ctx.IsUserSiteAdmin() || ctx.User.ID == image.UID
  104. }
  105. }
  106. func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {
  107. var ID = ctx.Params(":id")
  108. job, err := models.GetCloudbrainByID(ID)
  109. if err != nil {
  110. log.Error("GetCloudbrainByID failed:%v", err.Error())
  111. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  112. }
  113. ctx.Cloudbrain = job
  114. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  115. log.Error("!isAdminOrOwnerOrJobCreater error:%v", err.Error())
  116. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  117. }
  118. }
  119. func AdminOrJobCreaterRight(ctx *context.Context) {
  120. var ID = ctx.Params(":id")
  121. job, err := models.GetCloudbrainByID(ID)
  122. if err != nil {
  123. log.Error("GetCloudbrainByID failed:%v", err.Error())
  124. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  125. }
  126. ctx.Cloudbrain = job
  127. if !isAdminOrJobCreater(ctx, job, err) {
  128. log.Error("!isAdminOrJobCreater error:%v", err.Error())
  129. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  130. }
  131. }
  132. func AdminOrOwnerOrJobCreaterRightForTrain(ctx *context.Context) {
  133. var jobID = ctx.Params(":jobid")
  134. job, err := models.GetCloudbrainByJobID(jobID)
  135. if err != nil {
  136. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  137. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  138. }
  139. ctx.Cloudbrain = job
  140. if !isAdminOrOwnerOrJobCreater(ctx, job, err) {
  141. log.Error("!isAdminOrOwnerOrJobCreater failed:%v", err.Error())
  142. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  143. }
  144. }
  145. func AdminOrJobCreaterRightForTrain(ctx *context.Context) {
  146. var jobID = ctx.Params(":jobid")
  147. job, err := models.GetCloudbrainByJobID(jobID)
  148. if err != nil {
  149. log.Error("GetCloudbrainByJobID failed:%v", err.Error())
  150. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  151. }
  152. ctx.Cloudbrain = job
  153. if !isAdminOrJobCreater(ctx, job, err) {
  154. log.Error("!isAdminOrJobCreater errot:%v", err.Error())
  155. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  156. }
  157. }
  158. func AdminOrImageCreaterRight(ctx *context.Context) {
  159. id, err := strconv.ParseInt(ctx.Params(":id"), 10, 64)
  160. var image *models.Image
  161. if err != nil {
  162. log.Error("Get Image by ID failed:%v", err.Error())
  163. } else {
  164. image, err = models.GetImageByID(id)
  165. if err != nil {
  166. log.Error("Get Image by ID failed:%v", err.Error())
  167. return
  168. }
  169. }
  170. if !isAdminOrImageCreater(ctx, image, err) {
  171. log.Error("!isAdminOrImageCreater error:%v", err.Error())
  172. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  173. }
  174. }
  175. func GenerateTask(req GenerateCloudBrainTaskReq) error {
  176. dataActualPath := setting.Attachment.Minio.RealPath +
  177. setting.Attachment.Minio.Bucket + "/" +
  178. setting.Attachment.Minio.BasePath +
  179. models.AttachmentRelativePath(req.Uuids) +
  180. req.Uuids
  181. var resourceSpec *models.ResourceSpec
  182. var versionCount int
  183. if req.JobType == string(models.JobTypeTrain) {
  184. versionCount = 1
  185. if TrainResourceSpecs == nil {
  186. json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
  187. }
  188. for _, spec := range TrainResourceSpecs.ResourceSpec {
  189. if req.ResourceSpecId == spec.Id {
  190. resourceSpec = spec
  191. }
  192. }
  193. } else {
  194. if ResourceSpecs == nil {
  195. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  196. }
  197. for _, spec := range ResourceSpecs.ResourceSpec {
  198. if req.ResourceSpecId == spec.Id {
  199. resourceSpec = spec
  200. }
  201. }
  202. }
  203. if resourceSpec == nil {
  204. log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"])
  205. return errors.New("no such resourceSpec")
  206. }
  207. var datasetName string
  208. attach, err := models.GetAttachmentByUUID(req.Uuids)
  209. if err != nil {
  210. //for benchmark, do not return error
  211. log.Error("GetAttachmentByUUID failed:%v", err, req.Ctx.Data["MsgID"])
  212. } else {
  213. datasetName = attach.Name
  214. }
  215. createTime := timeutil.TimeStampNow()
  216. jobResult, err := CreateJob(req.JobName, models.CreateJobParams{
  217. JobName: req.JobName,
  218. RetryCount: 1,
  219. GpuType: req.GpuQueue,
  220. Image: req.Image,
  221. TaskRoles: []models.TaskRole{
  222. {
  223. Name: SubTaskName,
  224. TaskNumber: 1,
  225. MinSucceededTaskCount: 1,
  226. MinFailedTaskCount: 1,
  227. CPUNumber: resourceSpec.CpuNum,
  228. GPUNumber: resourceSpec.GpuNum,
  229. MemoryMB: resourceSpec.MemMiB,
  230. ShmMB: resourceSpec.ShareMemMiB,
  231. Command: req.Command,
  232. NeedIBDevice: false,
  233. IsMainRole: false,
  234. UseNNI: false,
  235. },
  236. },
  237. Volumes: []models.Volume{
  238. {
  239. HostPath: models.StHostPath{
  240. Path: req.CodePath,
  241. MountPath: CodeMountPath,
  242. ReadOnly: false,
  243. },
  244. },
  245. {
  246. HostPath: models.StHostPath{
  247. Path: dataActualPath,
  248. MountPath: DataSetMountPath,
  249. ReadOnly: true,
  250. },
  251. },
  252. {
  253. HostPath: models.StHostPath{
  254. Path: req.ModelPath,
  255. MountPath: ModelMountPath,
  256. ReadOnly: false,
  257. },
  258. },
  259. {
  260. HostPath: models.StHostPath{
  261. Path: req.BenchmarkPath,
  262. MountPath: BenchMarkMountPath,
  263. ReadOnly: true,
  264. },
  265. },
  266. {
  267. HostPath: models.StHostPath{
  268. Path: req.Snn4ImageNetPath,
  269. MountPath: Snn4imagenetMountPath,
  270. ReadOnly: true,
  271. },
  272. },
  273. {
  274. HostPath: models.StHostPath{
  275. Path: req.BrainScorePath,
  276. MountPath: BrainScoreMountPath,
  277. ReadOnly: true,
  278. },
  279. },
  280. },
  281. })
  282. if err != nil {
  283. log.Error("CreateJob failed:", err.Error(), req.Ctx.Data["MsgID"])
  284. return err
  285. }
  286. if jobResult.Code != Success {
  287. log.Error("CreateJob(%s) failed:%s", req.JobName, jobResult.Msg, req.Ctx.Data["MsgID"])
  288. return errors.New(jobResult.Msg)
  289. }
  290. var jobID = jobResult.Payload["jobId"].(string)
  291. err = models.CreateCloudbrain(&models.Cloudbrain{
  292. Status: string(models.JobWaiting),
  293. UserID: req.Ctx.User.ID,
  294. RepoID: req.Ctx.Repo.Repository.ID,
  295. JobID: jobID,
  296. JobName: req.JobName,
  297. DisplayJobName: req.DisplayJobName,
  298. SubTaskName: SubTaskName,
  299. JobType: req.JobType,
  300. Type: models.TypeCloudBrainOne,
  301. Uuid: req.Uuids,
  302. Image: req.Image,
  303. GpuQueue: req.GpuQueue,
  304. ResourceSpecId: req.ResourceSpecId,
  305. ComputeResource: models.GPUResource,
  306. BenchmarkTypeID: req.BenchmarkTypeID,
  307. BenchmarkChildTypeID: req.BenchmarkChildTypeID,
  308. Description: req.Description,
  309. IsLatestVersion: "1",
  310. VersionCount: versionCount,
  311. BranchName: req.BranchName,
  312. BootFile: req.BootFile,
  313. DatasetName: datasetName,
  314. Parameters: req.Params,
  315. CreatedUnix: createTime,
  316. UpdatedUnix: createTime,
  317. CommitID: req.CommitID,
  318. })
  319. if err != nil {
  320. return err
  321. }
  322. task, err := models.GetCloudbrainByJobID(jobID)
  323. if err != nil {
  324. log.Error("GetCloudbrainByJobID failed: %v", err.Error())
  325. return err
  326. }
  327. stringId := strconv.FormatInt(task.ID, 10)
  328. if IsBenchmarkJob(req.JobType) {
  329. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateBenchMarkTask)
  330. } else if string(models.JobTypeTrain) == req.JobType {
  331. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateGPUTrainTask)
  332. } else {
  333. notification.NotifyOtherTask(req.Ctx.User, req.Ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugGPUTask)
  334. }
  335. return nil
  336. }
  337. func IsBenchmarkJob(jobType string) bool {
  338. return string(models.JobTypeBenchmark) == jobType || string(models.JobTypeBrainScore) == jobType || string(models.JobTypeSnn4imagenet) == jobType
  339. }
  340. func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
  341. dataActualPath := setting.Attachment.Minio.RealPath +
  342. setting.Attachment.Minio.Bucket + "/" +
  343. setting.Attachment.Minio.BasePath +
  344. models.AttachmentRelativePath(task.Uuid) +
  345. task.Uuid
  346. jobName := task.JobName
  347. var resourceSpec *models.ResourceSpec
  348. if ResourceSpecs == nil {
  349. json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
  350. }
  351. for _, spec := range ResourceSpecs.ResourceSpec {
  352. if task.ResourceSpecId == spec.Id {
  353. resourceSpec = spec
  354. }
  355. }
  356. if resourceSpec == nil {
  357. log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
  358. return errors.New("no such resourceSpec")
  359. }
  360. createTime := timeutil.TimeStampNow()
  361. jobResult, err := CreateJob(jobName, models.CreateJobParams{
  362. JobName: jobName,
  363. RetryCount: 1,
  364. GpuType: task.GpuQueue,
  365. Image: task.Image,
  366. TaskRoles: []models.TaskRole{
  367. {
  368. Name: SubTaskName,
  369. TaskNumber: 1,
  370. MinSucceededTaskCount: 1,
  371. MinFailedTaskCount: 1,
  372. CPUNumber: resourceSpec.CpuNum,
  373. GPUNumber: resourceSpec.GpuNum,
  374. MemoryMB: resourceSpec.MemMiB,
  375. ShmMB: resourceSpec.ShareMemMiB,
  376. Command: Command,
  377. NeedIBDevice: false,
  378. IsMainRole: false,
  379. UseNNI: false,
  380. },
  381. },
  382. Volumes: []models.Volume{
  383. {
  384. HostPath: models.StHostPath{
  385. Path: storage.GetMinioPath(jobName, CodeMountPath+"/"),
  386. MountPath: CodeMountPath,
  387. ReadOnly: false,
  388. },
  389. },
  390. {
  391. HostPath: models.StHostPath{
  392. Path: dataActualPath,
  393. MountPath: DataSetMountPath,
  394. ReadOnly: true,
  395. },
  396. },
  397. {
  398. HostPath: models.StHostPath{
  399. Path: storage.GetMinioPath(jobName, ModelMountPath+"/"),
  400. MountPath: ModelMountPath,
  401. ReadOnly: false,
  402. },
  403. },
  404. {
  405. HostPath: models.StHostPath{
  406. Path: storage.GetMinioPath(jobName, BenchMarkMountPath+"/"),
  407. MountPath: BenchMarkMountPath,
  408. ReadOnly: true,
  409. },
  410. },
  411. {
  412. HostPath: models.StHostPath{
  413. Path: storage.GetMinioPath(jobName, Snn4imagenetMountPath+"/"),
  414. MountPath: Snn4imagenetMountPath,
  415. ReadOnly: true,
  416. },
  417. },
  418. {
  419. HostPath: models.StHostPath{
  420. Path: storage.GetMinioPath(jobName, BrainScoreMountPath+"/"),
  421. MountPath: BrainScoreMountPath,
  422. ReadOnly: true,
  423. },
  424. },
  425. },
  426. })
  427. if err != nil {
  428. log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"])
  429. return err
  430. }
  431. if jobResult.Code != Success {
  432. log.Error("CreateJob(%s) failed:%s", jobName, jobResult.Msg, ctx.Data["MsgID"])
  433. return errors.New(jobResult.Msg)
  434. }
  435. var jobID = jobResult.Payload["jobId"].(string)
  436. newTask := &models.Cloudbrain{
  437. Status: string(models.JobWaiting),
  438. UserID: task.UserID,
  439. RepoID: task.RepoID,
  440. JobID: jobID,
  441. JobName: task.JobName,
  442. DisplayJobName: task.DisplayJobName,
  443. SubTaskName: task.SubTaskName,
  444. JobType: task.JobType,
  445. Type: task.Type,
  446. Uuid: task.Uuid,
  447. Image: task.Image,
  448. GpuQueue: task.GpuQueue,
  449. ResourceSpecId: task.ResourceSpecId,
  450. ComputeResource: task.ComputeResource,
  451. CreatedUnix: createTime,
  452. UpdatedUnix: createTime,
  453. BranchName: task.BranchName,
  454. }
  455. err = models.RestartCloudbrain(task, newTask)
  456. if err != nil {
  457. log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"])
  458. return err
  459. }
  460. stringId := strconv.FormatInt(newTask.ID, 10)
  461. *newID = stringId
  462. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, task.DisplayJobName, models.ActionCreateDebugGPUTask)
  463. return nil
  464. }