nvidia-oliver-holworthy committed
Commit 823b162 · unverified · 1 parent: d8e9858

Update padding implementation to reduce memory footprint

Files changed (1):
  1. modeling_qwen3_vl_nemotron_embed.py +41 -22
modeling_qwen3_vl_nemotron_embed.py CHANGED
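This commit replaces the F.pad-and-torch.cat padding in ColBERTScoringMixin.padding_various_shape_tensor with a module-level _pad_and_stack_embeddings helper that pre-allocates the output tensor once and copies each batch into it in place, dropping each source batch as soon as it has been copied. The old approach held the original batches, their padded copies, and the concatenated result in memory at the same time; the new one only ever holds the result plus the batches not yet copied. Short sketches of the helper's behavior and the memory arithmetic follow the relevant hunks below.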
@@ -94,6 +94,42 @@ def _create_bidirectional_mask(
     return None
 
 
+def _pad_and_stack_embeddings(tensors: List[torch.Tensor]) -> torch.Tensor:
+    """Pad embedding tensors to uniform sequence length and concatenate.
+
+    Args:
+        tensors: List of tensors with shape (batch, seq_len, hidden_dim).
+            Each tensor may have a different seq_len.
+
+    Returns:
+        Concatenated tensor with shape (total_batch, max_seq_len, hidden_dim),
+        where sequences shorter than max_seq_len are zero-padded.
+    """
+    if not tensors:
+        raise ValueError("Cannot pad empty tensor list")
+
+    max_seq_len = max(t.shape[1] for t in tensors)
+    total_docs = sum(t.shape[0] for t in tensors)
+    hidden_dim = tensors[0].shape[2]
+    dtype = tensors[0].dtype
+
+    # Pre-allocate result tensor
+    result = torch.zeros(total_docs, max_seq_len, hidden_dim, dtype=dtype)
+
+    # Copy in-place and release references to free memory
+    offset = 0
+    for i in range(len(tensors)):
+        t = tensors[i]
+        tensors[i] = None  # Release reference immediately
+        batch_size = t.shape[0]
+        seq_len = t.shape[1]
+        result[offset : offset + batch_size, :seq_len, :] = t
+        offset += batch_size
+        del t
+
+    return result
+
+
 class Qwen3VLNemotronEmbedTextModel(Qwen3VLTextModel):
     """Bidirectional text model for Qwen3VLNemotronEmbed."""
 
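A minimal usage sketch of the new helper (shapes and values are illustrative, not from the repository; note the helper deliberately nulls out the entries of the list you pass in):

import torch

# Two batches of per-token embeddings with different sequence lengths,
# shaped (batch, seq_len, hidden_dim); hidden_dim=4 is arbitrary here.
a = torch.randn(2, 5, 4)
b = torch.randn(3, 8, 4)

out = _pad_and_stack_embeddings([a, b])
print(out.shape)               # torch.Size([5, 8, 4])
print(out[0, 5:].abs().sum())  # tensor(0.) -- rows past batch a's seq_len stay zero-padded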
@@ -247,7 +283,7 @@ class EmbeddingMixin:
             Tensor of embeddings with shape (num_samples, max_seq_len, hidden_dim).
         """
         device = next(self.parameters()).device
-        qs = []
+        embedding_batches = []
         message = "query" if is_query else "document"
 
         for batch in tqdm(dataloader, desc=f"Extracting {message} embeddings..."):
@@ -269,10 +305,9 @@ class EmbeddingMixin:
             if not torch.isfinite(embeddings).all():
                 raise ValueError("Embeddings contain NaN or Inf values")
 
-            qs.append(embeddings.detach().cpu())
+            embedding_batches.append(embeddings.detach().cpu())
 
-        all_embeddings_tensor = self.padding_various_shape_tensor(qs)
-        return all_embeddings_tensor
+        return _pad_and_stack_embeddings(embedding_batches)
 
     def forward_queries(self, queries: List[str], batch_size: int = 8) -> torch.Tensor:
         """Forward text queries and extract embeddings.
@@ -357,22 +392,6 @@ class EmbeddingMixin:
 class ColBERTScoringMixin:
     """Mixin providing ColBERT MaxSim scoring methods."""
 
-    def padding_various_shape_tensor(self, tensors: List[torch.Tensor]) -> torch.Tensor:
-        """Pad tensors of various shapes for ColBERT-like scoring.
-
-        Args:
-            tensors: List of tensors with shape (batch, seq_len, hidden_dim)
-
-        Returns:
-            Concatenated tensor with all sequences padded to max length.
-        """
-        max_seq_len = max(t.shape[1] for t in tensors)
-        padded_tensors = [
-            F.pad(t, (0, 0, 0, max_seq_len - t.shape[1]), mode="constant", value=0)
-            for t in tensors
-        ]
-        return torch.cat(padded_tensors, dim=0)
-
     def colbert_score(
         self,
         qs: Union[torch.Tensor, List[torch.Tensor]],
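For scale, a rough, unmeasured comparison of peak host memory between the removed implementation above and the new helper, writing T for the element count of the final tensor (total_docs * max_seq_len * hidden_dim):

# Old (F.pad + torch.cat):
#   padded copies of every batch  ~ T   (F.pad allocates one copy per batch)
#   + concatenated result         ~ T   (torch.cat allocates while copies are alive)
#   + the caller's original list  ~ up to T (still referenced)
#   => peak on the order of 3*T
#
# New (pre-allocate + in-place copy):
#   result                        ~ T   (one torch.zeros allocation)
#   + not-yet-copied originals    ~ up to T, released one by one (tensors[i] = None)
#   => peak on the order of 2*T, falling as the loop consumes batches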
@@ -448,11 +467,11 @@ class ColBERTScoringMixin:
         if isinstance(query_embeddings, list):
             if len(query_embeddings[0].shape) == 2:
                 query_embeddings = [q.unsqueeze(0) for q in query_embeddings]
-            query_embeddings = self.padding_various_shape_tensor(query_embeddings)
+            query_embeddings = _pad_and_stack_embeddings(query_embeddings)
         if isinstance(passage_embeddings, list):
             if len(passage_embeddings[0].shape) == 2:
                 passage_embeddings = [p.unsqueeze(0) for p in passage_embeddings]
-            passage_embeddings = self.padding_various_shape_tensor(passage_embeddings)
+            passage_embeddings = _pad_and_stack_embeddings(passage_embeddings)
 
         return self.colbert_score(
             query_embeddings, passage_embeddings, batch_size or 128
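colbert_score itself is unchanged by this commit and its body is not shown in the diff; for context, here is a minimal sketch of the standard ColBERT MaxSim it is named after, operating on tensors padded as above (assumes L2-normalized embeddings; the actual method also batches the computation and would typically mask padding):

import torch

def maxsim(qs: torch.Tensor, ps: torch.Tensor) -> torch.Tensor:
    """qs: (num_queries, q_len, dim); ps: (num_passages, p_len, dim).

    For each query token, take the similarity of its best-matching passage
    token, then sum over query tokens. Returns (num_queries, num_passages).
    """
    sims = torch.einsum("qnd,pmd->qpnm", qs, ps)  # token-level dot products
    # Zero padding rows contribute 0 under the max; real implementations
    # usually mask them so negative similarities are not inflated by padding.
    return sims.max(dim=-1).values.sum(dim=-1)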