diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
index 4e959a17..8d13e745 100644
--- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
+++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
@@ -42,7 +42,10 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
         self.max_frames = model_config['max_frames']
         self.feature_framerate = model_config['feature_framerate']
         self.image_resolution = 224
-        self.device = model_config['device']
+        if torch.cuda.is_available():
+            self.device = model_config['device']
+        else:
+            self.device = 'cpu'
         self.init_model = f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}'
 
         self.tokenizer = ClipTokenizer(model_dir)
diff --git a/modelscope/models/multi_modal/mmr/models/modeling.py b/modelscope/models/multi_modal/mmr/models/modeling.py
index 214e65c7..21cc4c80 100644
--- a/modelscope/models/multi_modal/mmr/models/modeling.py
+++ b/modelscope/models/multi_modal/mmr/models/modeling.py
@@ -85,9 +85,6 @@ class CLIP4Clip(nn.Module):
             linear_patch=config['linear_patch'],
             use_gc=config['use_gc']).float()
 
-        if (platform.system() != 'Darwin'):
-            convert_weights(self.clip)  # fp16
-
         if backbone in ['ViT-B/32', 'ViT-B/16']:
             cross_config = SimpleNamespace(**{
                 'hidden_size': 512,
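For reference, the first hunk makes device selection degrade gracefully on machines without a GPU. A minimal standalone sketch of that fallback pattern, assuming only `torch`; the `resolve_device` helper is illustrative and not part of the ModelScope codebase:

```python
import torch


def resolve_device(configured_device: str) -> torch.device:
    # Mirrors the diff above: honor the configured device (e.g. 'cuda:0')
    # only when CUDA is actually available, otherwise fall back to CPU so
    # the model still loads on CPU-only hosts.
    if torch.cuda.is_available():
        return torch.device(configured_device)
    return torch.device('cpu')


# On a CPU-only machine this yields device(type='cpu') even if the model
# config requests 'cuda:0'; with a GPU present it returns that device.
device = resolve_device('cuda:0')
```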