Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/NVIDIA/cutlass/llms.txt

Use this file to discover all available pages before exploring further.

The CuTe DSL provides comprehensive memory management APIs for creating and manipulating tensors across different memory spaces (global, shared, register).

Tensor Creation

make_tensor

Creates a tensor from a pointer and layout.
tensor = cute.make_tensor(ptr, layout)
ptr
Pointer
required
Pointer to memory (global, shared, or register)
layout
Layout
required
Layout defining shape and stride
Example:
# Create tensor from global memory pointer
ptr = cute.make_ptr(data, element_type=cute.Float32)
layout = cute.make_layout(
    shape=(M, N),
    stride=(N, 1)  # Row-major
)
tensor = cute.make_tensor(ptr, layout)

make_fragment

Allocates a register memory tensor (fragment).
fragment = cute.make_fragment(element_type, layout)
element_type
Type[Numeric]
required
Element data type (e.g., cute.Float32, cute.BFloat16)
layout
Layout
required
Layout of the fragment
Example:
# Allocate 16x8 fragment in registers
layout = cute.make_layout((16, 8))
fragment = cute.make_fragment(cute.Float32, layout)

make_fragment_like

Allocates a fragment with the same shape as another tensor.
fragment = cute.make_fragment_like(tensor)
tensor
Tensor
required
Template tensor
Example:
# Create fragment matching partitioned tensor
thrA = thr_copy.partition_S(blockA)
fragA = cute.make_fragment_like(thrA)

make_rmem_tensor

Allocates a register memory tensor.
tensor = cute.make_rmem_tensor(element_type, layout)
Alias for make_fragment.

make_rmem_tensor_like

Allocates a register memory tensor matching another tensor.
tensor = cute.make_rmem_tensor_like(template_tensor)
Alias for make_fragment_like.

Tensor Initialization

zeros_like

Creates a tensor initialized to zero.
zero_tensor = cute.zeros_like(tensor)
Example:
# Initialize accumulator to zero
accum = cute.zeros_like(fragC)

ones_like

Creates a tensor initialized to one.
one_tensor = cute.ones_like(tensor)

full

Creates a tensor filled with a specific value.
filled_tensor = cute.full(layout, value, element_type)
layout
Layout
required
Tensor layout
value
Numeric
required
Fill value
element_type
Type[Numeric]
required
Element data type

full_like

Creates a tensor filled with a value, matching another tensor’s shape.
filled_tensor = cute.full_like(tensor, value)
Example:
# Fill every element with 1.0 (handy for debugging); note this is an
# all-ones tensor, not an identity matrix
identity = cute.full_like(fragC, 1.0)

empty_like

Allocates an uninitialized tensor matching another tensor’s shape.
empty_tensor = cute.empty_like(tensor)

Pointer Operations

make_ptr

Creates a typed pointer from a raw memory address.
ptr = cute.make_ptr(
    address,
    element_type=cute.Float32,
    address_space=cute.AddressSpace.GLOBAL
)
address
Union[int, Tensor]
required
Memory address or tensor
element_type
Type[Numeric]
required
Element data type
address_space
AddressSpace
default:"GLOBAL"
Memory address space
Example:
# Create pointer from DLPack tensor
from cutlass.cute.runtime import from_dlpack

tensor_desc = from_dlpack(torch_tensor)
ptr = cute.make_ptr(
    tensor_desc,
    element_type=cute.Float16,
    address_space=cute.AddressSpace.GLOBAL
)

recast_ptr

Recasts a pointer to a different element type.
new_ptr = cute.recast_ptr(ptr, new_element_type)
ptr
Pointer
required
Original pointer
new_element_type
Type[Numeric]
required
New element data type
Example:
# Recast FP16 pointer to INT8 for quantization
f16_ptr = cute.make_ptr(data, element_type=cute.Float16)
i8_ptr = cute.recast_ptr(f16_ptr, cute.Int8)

Layout Operations

make_layout

Creates a layout from shape and stride.
layout = cute.make_layout(
    shape=(M, N),
    stride=(N, 1)  # Row-major
)
shape
Shape
required
Shape tuple (can be nested)
stride
Stride
default:"compact"
Stride tuple (defaults to compact column-major, i.e. the leftmost mode gets stride 1)
Examples:
# Row-major 2D
layout = cute.make_layout((M, N), stride=(N, 1))

# Column-major 2D
layout = cute.make_layout((M, N), stride=(1, M))

# Hierarchical layout
layout = cute.make_layout(
    shape=((4, 8), (2, 16)),
    stride=((128, 1), (64, 8))
)

make_identity_layout

Creates an identity layout (coordinate layout).
layout = cute.make_identity_layout(shape)
Example:
# Create coordinate layout for predication
coord_layout = cute.make_identity_layout((M, N))

make_ordered_layout

Creates a layout with specified dimension ordering.
layout = cute.make_ordered_layout(
    shape=(M, N),
    order=(1, 0)  # Row-major: mode 1 (N) is ordered first, so it gets stride 1
)

make_layout_like

Creates a layout matching another layout’s shape.
new_layout = cute.make_layout_like(template_layout)

recast_layout

Recasts a layout to a different element type.
new_layout = cute.recast_layout(layout, old_type, new_type)
Example:
# Recast layout from FP32 to FP16 (doubles logical size)
f16_layout = cute.recast_layout(f32_layout, cute.Float32, cute.Float16)

Identity and Coordinate Tensors

make_identity_tensor

Creates a coordinate tensor for bounds checking.
coord_tensor = cute.make_identity_tensor(shape)
shape
Shape
required
Tensor shape
Example:
# Create coordinate tensor for predication
shape = (M, N)
coord = cute.make_identity_tensor(shape)

# Partition to threads
thr_coord = thr_copy.partition_S(coord)

# Create predicate for out-of-bounds
pred = thr_coord < (M, N)

# Predicated copy
cute.basic_copy_if(pred, src, dst)

Tensor Recasting

recast_tensor

Recasts a tensor to a different element type.
new_tensor = cute.recast_tensor(tensor, new_element_type)
tensor
Tensor
required
Original tensor
new_element_type
Type[Numeric]
required
New element data type
Example:
# View the FP32 tensor's storage as FP16 elements.
# NOTE(review): a recast reinterprets the underlying bits rather than
# converting values numerically — confirm against the API reference
f16_tensor = cute.recast_tensor(f32_tensor, cute.Float16)

Memory Space Management

Address Spaces

# Global memory
ptr_global = cute.make_ptr(
    address,
    element_type=cute.Float32,
    address_space=cute.AddressSpace.GLOBAL
)

# Shared memory
ptr_shared = cute.make_ptr(
    address,
    element_type=cute.Float32,
    address_space=cute.AddressSpace.SHARED
)

# Register memory (fragments)
fragment = cute.make_fragment(cute.Float32, layout)

Tensor Partitioning

Partition by Tiled Copy

# Partition source tensor
thrA_src = thr_copy.partition_S(block_tensor)

# Partition destination tensor
thrA_dst = thr_copy.partition_D(fragment)

Partition by Tiled MMA

# Partition A operand
thrA = thr_mma.partition_A(block_A)

# Partition B operand
thrB = thr_mma.partition_B(block_B)

# Partition C (accumulator)
thrC = thr_mma.partition_C(block_C)

Complete Example: Memory Management

import cutlass.cute as cute
from cutlass.cute.runtime import from_dlpack

@cute.kernel
def memory_example_kernel(
    gA: cute.Tensor,  # Global memory input
    gB: cute.Tensor,  # Global memory output
    sA_layout: cute.Layout,  # Shared memory layout
):
    """Copy one tile of ``gA`` to ``gB`` through registers and shared memory.

    Each block selects the tile addressed by its x block index, and each
    thread moves its slice of that tile along the path
    global -> register -> shared -> register -> global.

    Args:
        gA: Input tensor; indexed as ``gA[((None, None), bidx)]``, so it is
            expected to arrive already arranged as (tile, tile-index).
        gB: Output tensor, indexed the same way as ``gA``.
        sA_layout: Layout used for the shared-memory staging tensor.
    """
    thread_x, _, _ = cute.arch.thread_idx()
    block_x, _, _ = cute.arch.block_idx()

    elem_type = gA.element_type

    # Shared-memory staging tensor, built from a pointer tagged with the
    # SHARED address space.
    smem_base = cute.make_ptr(
        cute.arch.smem_ptr(0),
        element_type=elem_type,
        address_space=cute.AddressSpace.SHARED,
    )
    smem_tile = cute.make_tensor(smem_base, sA_layout)

    # Tiles of the input and output that belong to this block.
    src_tile = gA[((None, None), block_x)]
    dst_tile = gB[((None, None), block_x)]

    # Thread/value arrangement of the tiled copy: 4 x 32 threads, each
    # handling a 1 x 4 group of values.
    atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), elem_type)
    threads = cute.make_layout((4, 32), stride=(32, 1))
    values = cute.make_layout((1, 4), stride=(4, 1))
    per_thread = cute.make_tiled_copy_tv(atom, threads, values).get_slice(thread_x)

    # This thread's view of the global source and the shared destination.
    src_part = per_thread.partition_S(src_tile)
    smem_part = per_thread.partition_D(smem_tile)

    # Global -> register -> shared.
    staged_in = cute.make_fragment_like(src_part)
    cute.copy(atom, src_part, staged_in)
    cute.copy(atom, staged_in, smem_part)

    # Barrier between the shared-memory write above and the read below.
    cute.arch.syncthreads()

    # Shared -> register -> global.
    dst_part = per_thread.partition_D(dst_tile)
    staged_out = cute.make_fragment_like(dst_part)
    cute.basic_copy(smem_part, staged_out)
    cute.copy(atom, staged_out, dst_part)


@cute.jit
def memory_example(
    torch_A,
    torch_B,
    stream,
):
    """Host-side entry point: wrap the inputs, tile them, and launch the kernel.

    Args:
        torch_A: 2-D source tensor (DLPack-compatible); its ``shape`` supplies
            ``(M, N)`` for both tensors.
        torch_B: Destination tensor (DLPack-compatible), viewed with the same
            ``(M, N)`` row-major layout as ``torch_A``.
        stream: Stream the kernel is launched on.
    """
    # Convert from DLPack
    A_desc = from_dlpack(torch_A)
    B_desc = from_dlpack(torch_B)

    # Row-major (M, N) views over the two buffers
    M, N = torch_A.shape
    mA = cute.make_tensor(
        cute.make_ptr(A_desc, element_type=cute.Float32),
        cute.make_layout((M, N), stride=(N, 1))
    )
    mB = cute.make_tensor(
        cute.make_ptr(B_desc, element_type=cute.Float32),
        cute.make_layout((M, N), stride=(N, 1))
    )

    # Shared memory layout
    # NOTE(review): a 128 x 128 Float32 tile is 64 KiB of shared memory, which
    # is above the 48 KiB default per-block limit on many GPUs — confirm the
    # launch reserves enough shared memory, or shrink the tile.
    tile_M, tile_N = 128, 128
    sA_layout = cute.make_layout((tile_M, tile_N))

    # The kernel selects its tile with gA[((None, None), bidx)], so the
    # tensors must be arranged as ((tile_M, tile_N), (rest_M, rest_N)) before
    # they are passed in.
    # NOTE(review): the kernel does no predication, so this presumably needs
    # M and N to be multiples of the tile size — confirm or add bounds checks.
    tiler = (tile_M, tile_N)
    gA = cute.zipped_divide(mA, tiler)
    gB = cute.zipped_divide(mB, tiler)

    # One block per tile, counting tiles along BOTH dimensions; counting only
    # along M would leave every column tile past the first unprocessed.
    tiles_m = (M + tile_M - 1) // tile_M
    tiles_n = (N + tile_N - 1) // tile_N
    num_blocks = tiles_m * tiles_n

    # Launch kernel: 128 threads per block matches the kernel's 4 x 32
    # thread layout.
    memory_example_kernel(gA, gB, sA_layout).launch(
        grid=(num_blocks, 1, 1),
        block=(128, 1, 1),
        stream=stream
    )

See Also