mtmd : Add Nemotron Nano 12B v2 VL support (#19547)
* nemotron nano v2 vlm support added
* simplified code; addressed reviews
* pre-downsample position embeddings during GGUF conversion for fixed input size
parent 1725e316c1
commit 01d8eaa28d
9 changed files with 167 additions and 1 deletion
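For context, the headline change is the last bullet: rather than resizing RADIO's learned position embeddings at inference time, the converter bilinearly downsamples them once, to match the fixed 512x512 input. A minimal standalone sketch of that step, assuming the shapes noted in the diff's own comments (a 128x128 source grid, patch size 16, hidden size 1280); the tensor here is a random stand-in for the real checkpoint weight:

    import torch
    import torch.nn.functional as F

    # Stand-in for patch_generator.pos_embed; real values come from the checkpoint.
    pos_embed = torch.randn(1, 128 * 128, 1280)

    n_embd = pos_embed.shape[2]
    src = int(pos_embed.shape[1] ** 0.5)   # 128 patches per side at the trained max resolution
    dst = 512 // 16                        # 32 patches per side for the fixed 512x512 input

    # Reshape to a 2D grid, resize bilinearly, flatten back to a sequence.
    grid = pos_embed.reshape(1, src, src, n_embd).permute(0, 3, 1, 2).float()  # [1, 1280, 128, 128]
    grid = F.interpolate(grid, size=(dst, dst), mode='bilinear', align_corners=True)
    pos_embed = grid.permute(0, 2, 3, 1).reshape(1, dst * dst, n_embd)         # [1, 1024, 1280]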
@@ -4074,6 +4074,87 @@ class InternVisionModel(MmprojModel):
        yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register(
    "NemotronH_Nano_VL_V2",
    "RADIOModel",
)
class NemotronNanoV2VLModel(MmprojModel):
    # ViT-Huge architecture parameters for RADIO v2.5-h
    _vit_hidden_size = 1280
    _vit_intermediate_size = 5120
    _vit_num_layers = 32
    _vit_num_heads = 16

    def get_vision_config(self) -> dict[str, Any] | None:
        # RADIO config doesn't have standard ViT parameters, so they need to be constructed manually
        vision_config = self.global_config.get("vision_config")
        if vision_config is None:
            return None
        # Add ViT-H parameters
        vision_config = {
            **vision_config,
            "hidden_size": self._vit_hidden_size,
            "intermediate_size": self._vit_intermediate_size,
            "num_hidden_layers": self._vit_num_layers,
            "num_attention_heads": self._vit_num_heads,
            "image_size": self.global_config.get("force_image_size", 512),
        }
        return vision_config

    def set_gguf_parameters(self):
        # Fall back to the standard ImageNet normalization stats if the preprocessor config omits them
        if "image_mean" not in self.preprocessor_config:
            self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
        if "image_std" not in self.preprocessor_config:
            self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225]

        super().set_gguf_parameters()
        hparams = self.global_config
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
        self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
        self.gguf_writer.add_vision_use_gelu(True)
        downsample_ratio = hparams.get("downsample_ratio", 0.5)
        # e.g. the default downsample_ratio of 0.5 yields a projector scale factor of int(1.0 / 0.5) = 2
        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # Store position embeddings in F32 regardless of the requested quantization
        if ".position_embd." in new_name or "pos_embed" in new_name:
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if "input_conditioner" in name:
            return

        # RADIO's pos_embed doesn't have a .weight suffix, but clip.cpp expects one
        if "patch_generator.pos_embed" in name:
            if not name.endswith(".weight"):
                name += ".weight"
            # Downsample position embeddings for the fixed 512x512 image size
            import torch.nn.functional as F
            n_embd = self.hparams["hidden_size"]
            image_size = self.global_config.get("force_image_size", 512)
            patch_size = self.hparams["patch_size"]
            target_patches_per_side = image_size // patch_size  # 32
            max_patches_per_side = int((data_torch.shape[1]) ** 0.5)  # 128
            if target_patches_per_side != max_patches_per_side:
                # Reshape to a 2D grid, interpolate bilinearly, flatten back to a sequence
                data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd)
                data_torch = data_torch.permute(0, 3, 1, 2).float()  # [1, n_embd, 128, 128]
                data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side),
                                           mode='bilinear', align_corners=True)
                data_torch = data_torch.permute(0, 2, 3, 1)  # [1, 32, 32, n_embd]
                data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd)

        # Reshape the linear patch embedding to conv2d layout for ggml_conv_2d:
        # from [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size]
        if "patch_generator.embedder" in name:
            patch_size = self.hparams["patch_size"]
            n_embd = self.hparams["hidden_size"]
            data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)

        if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
            yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("WavTokenizerDec")
|
||||
class WavTokenizerDecModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
|
||||
|
|
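One detail worth calling out from the hunk above: RADIO stores the patch embedder as a linear layer over flattened patches, while clip.cpp runs it through ggml_conv_2d, so the converter reshapes the weight into conv layout. A small self-check of that equivalence, assuming channel-first patch flattening (the layout the reshape implies); all shapes here are hypothetical:

    import torch
    import torch.nn.functional as F

    p, n_embd = 16, 1280
    w_linear = torch.randn(n_embd, 3 * p * p)         # checkpoint layout: [n_embd, 3*p*p]
    img = torch.randn(1, 3, 2 * p, 2 * p)             # a toy image covering 2x2 patches

    # Path 1: flatten each patch channel-first, then apply the linear weight.
    patches = F.unfold(img, kernel_size=p, stride=p)  # [1, 3*p*p, 4]
    out_linear = w_linear @ patches[0]                # [n_embd, 4]

    # Path 2: reshape to [n_embd, 3, p, p] (as modify_tensors does) and run conv2d.
    w_conv = w_linear.reshape(n_embd, 3, p, p)
    out_conv = F.conv2d(img, w_conv, stride=p).reshape(n_embd, -1)  # [n_embd, 4]

    assert torch.allclose(out_linear, out_conv, atol=1e-3)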
@@ -7055,6 +7136,8 @@ class Mamba2Model(TextModel):
        if hparams is None:
            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                hparams = json.load(f)
        # VLM checkpoints (e.g., Nemotron Nano 12B v2 VL) nest the text-model config under "llm_config";
        # mirror it to the "text_config" key the converter expects
        if "llm_config" in hparams:
            hparams["text_config"] = hparams["llm_config"]
        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
@@ -9542,6 +9625,14 @@ class NemotronHModel(GraniteHybridModel):
        self.gguf_writer.add_add_bos_token(True)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision model and projector tensors for VLM models such as Nemotron Nano 12B v2 VL;
        # they are handled by the mmproj conversion
        if name.startswith(("vision_model.", "mlp1.")):
            return

        # Strip the "language_model." prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
        if name.startswith("language_model."):
            name = name[len("language_model."):]

        if self.is_moe and bid is not None:
            if name.endswith("mixer.gate.e_score_correction_bias"):
                new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
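Concretely, under this mapping a VLM checkpoint tensor named, say, language_model.backbone.layers.0.mixer.in_proj.weight (an illustrative name) is converted by the existing rules for backbone.layers.0.mixer.in_proj.weight, while anything under vision_model. or mlp1. is skipped here and picked up by the mmproj conversion in the first hunk.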