vllm.model_executor.layers.quantization.utils.marlin_utils ¶
   _check_marlin_supported ¶
 _check_marlin_supported(
    quant_type: ScalarType,
    group_size: int | None,
    has_zp: bool,
    device_capability: int | None = None,
) -> tuple[bool, str | None]
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
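A minimal usage sketch for the support check. It assumes `vllm.scalar_type.scalar_types.uint4b8` (the 4-bit, bias-8 type typically used by GPTQ-style schemes) and passes `device_capability` explicitly so no GPU query is needed; both are illustrative choices, not requirements of the API.

```python
from vllm.scalar_type import scalar_types
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    _check_marlin_supported,
)

# Returns (ok, reason): reason is a human-readable string when unsupported.
ok, reason = _check_marlin_supported(
    quant_type=scalar_types.uint4b8,  # assumed GPTQ-style 4-bit ScalarType
    group_size=128,
    has_zp=False,
    device_capability=80,  # pass explicitly to skip querying the current GPU
)
if not ok:
    print(f"Marlin unsupported: {reason}")
```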
   apply_awq_marlin_linear ¶
 apply_awq_marlin_linear(
    input: Tensor,
    weight: Tensor,
    weight_scale: Tensor,
    weight_zp: Tensor,
    g_idx: Tensor,
    g_idx_sort_indices: Tensor,
    workspace: Tensor,
    quant_type: ScalarType,
    output_size_per_partition: int,
    input_size_per_partition: int,
    bias: Tensor | None = None,
    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   apply_gptq_marlin_linear ¶
 apply_gptq_marlin_linear(
    input: Tensor,
    weight: Tensor,
    weight_scale: Tensor,
    weight_zp: Tensor,
    g_idx: Tensor,
    g_idx_sort_indices: Tensor,
    workspace: Tensor,
    wtype: ScalarType,
    output_size_per_partition: int,
    input_size_per_partition: int,
    is_k_full: bool,
    bias: Tensor | None = None,
    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
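A sketch of how a GPTQ-Marlin scheme's forward pass might route its already-repacked tensors through this helper. The attribute names on `layer` below are hypothetical placeholders, not vLLM's actual parameter names; the real scheme stores Marlin-format weights, scales, and a workspace at load time.

```python
import torch
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    apply_gptq_marlin_linear,
)

def gptq_marlin_forward(layer, x: torch.Tensor,
                        bias: torch.Tensor | None = None) -> torch.Tensor:
    # Hypothetical attributes: weights are assumed to be repacked into
    # Marlin format and the workspace allocated before the first forward.
    return apply_gptq_marlin_linear(
        input=x,
        weight=layer.marlin_qweight,          # assumed attribute name
        weight_scale=layer.marlin_scales,     # assumed attribute name
        weight_zp=layer.marlin_zp,            # assumed attribute name
        g_idx=layer.g_idx,
        g_idx_sort_indices=layer.g_idx_sort_indices,
        workspace=layer.workspace,
        wtype=layer.quant_type,               # a ScalarType, e.g. uint4b8
        output_size_per_partition=layer.output_size_per_partition,
        input_size_per_partition=layer.input_size_per_partition,
        is_k_full=layer.is_k_full,
        bias=bias,
    )
```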
   apply_rtn_marlin_linear ¶
 apply_rtn_marlin_linear(
    input: Tensor,
    weight: Tensor,
    weight_scale: Tensor,
    workspace: Tensor,
    quant_type: ScalarType,
    output_size_per_partition: int,
    input_size_per_partition: int,
    bias: Tensor | None = None,
    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   awq_to_marlin_zero_points ¶
 awq_to_marlin_zero_points(
    q_zp_packed: Tensor,
    size_k: int,
    size_n: int,
    num_bits: int,
) -> Tensor
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   check_marlin_supported ¶
 check_marlin_supported(
    quant_type: ScalarType,
    group_size: int,
    has_zp: bool = False,
    device_capability: int | None = None,
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
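A sketch of gating a fast path on the boolean variant, assuming `scalar_types.uint4` is the unsigned 4-bit type paired with zero points (AWQ-style); the capability value is illustrative.

```python
from vllm.scalar_type import scalar_types
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    check_marlin_supported,
)

# Decide whether to take the Marlin path for a 4-bit, zero-point config.
use_marlin = check_marlin_supported(
    scalar_types.uint4,   # assumed unsigned 4-bit ScalarType used with has_zp
    group_size=128,
    has_zp=True,
    device_capability=80,
)
```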
    check_marlin_supports_layer ¶
 check_marlin_supports_layer(
    layer: LinearBase, group_size: int
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   check_marlin_supports_shape ¶
 check_marlin_supports_shape(
    output_size_per_partition: int,
    input_size_per_partition: int,
    input_size: int,
    group_size: int,
) -> tuple[bool, str | None]
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
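A sketch of a shape check before committing to the Marlin path; the partition sizes below are illustrative only.

```python
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    check_marlin_supports_shape,
)

ok, reason = check_marlin_supports_shape(
    output_size_per_partition=4096,
    input_size_per_partition=11008,
    input_size=11008,   # full (unpartitioned) input size
    group_size=128,
)
if not ok:
    print(f"Falling back from Marlin: {reason}")
```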
   check_moe_marlin_supports_layer ¶
 check_moe_marlin_supports_layer(
    layer: LinearBase, group_size: int
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   get_scale_perms ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_is_k_full ¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_make_empty_g_idx ¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_make_empty_zp ¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_make_workspace ¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_make_workspace_new ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_moe_intermediate_size ¶
  Given Marlin-packed weight matrices w1_packed and w2_packed, return the MoE intermediate size N.
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
    marlin_moe_permute_scales ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_permute_bias ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
    marlin_permute_scales ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_repeat_scales_on_all_ranks ¶
 marlin_repeat_scales_on_all_ranks(
    act_order: bool, group_size: int, is_row_parallel: bool
) -> bool
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
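A sketch of how a tensor-parallel linear might use this helper when deciding whether scales can be sharded along K or must be replicated; no particular outcome is assumed, since it depends on the act-order / group-size combination.

```python
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    marlin_repeat_scales_on_all_ranks,
)

# A desc_act (act_order) GPTQ checkpoint loaded into a RowParallelLinear.
replicate_scales = marlin_repeat_scales_on_all_ranks(
    act_order=True,
    group_size=128,
    is_row_parallel=True,
)
print("replicate scales on all ranks:", replicate_scales)
```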
   marlin_sort_g_idx ¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   marlin_zero_points ¶
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   maybe_warn_marlin_atomic_add ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   maybe_warn_marlin_atomic_add_env ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   moe_awq_to_marlin_zero_points ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   query_marlin_supported_quant_types ¶
 query_marlin_supported_quant_types(
    has_zp: bool | None = None,
    include_fp_type: bool = True,
    device_capability: int | None = None,
)
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
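A sketch that lists the quant types the Marlin kernels accept for a symmetric (no zero-point) configuration; the capability value is illustrative.

```python
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    query_marlin_supported_quant_types,
)

supported = query_marlin_supported_quant_types(
    has_zp=False,          # symmetric quantization only
    include_fp_type=True,  # also include floating-point quant types
    device_capability=80,
)
print([str(t) for t in supported])
```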
   should_use_atomic_add_reduce ¶
  Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
   verify_marlin_supported ¶
 verify_marlin_supported(
    quant_type: ScalarType,
    group_size: int,
    has_zp: bool = False,
) -> None
Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py
    verify_marlin_supports_shape ¶
 verify_marlin_supports_shape(
    output_size_per_partition: int,
    input_size_per_partition: int,
    input_size: int,
    group_size: int,
) -> None
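Both `verify_*` helpers mirror the `check_*` variants but raise instead of returning a status, which suits early validation in a quantization config. A sketch with illustrative sizes:

```python
from vllm.scalar_type import scalar_types
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    verify_marlin_supported,
    verify_marlin_supports_shape,
)

# Fail fast at config/load time rather than inside the kernel.
verify_marlin_supported(scalar_types.uint4b8, group_size=128, has_zp=False)
verify_marlin_supports_shape(
    output_size_per_partition=4096,
    input_size_per_partition=4096,
    input_size=4096,
    group_size=128,
)
```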