
SAM3: open-vocabulary segmentation, and it is seriously strong (SAM3 weight download at the end) (single-image test, video test, real-time tracking)

Installation steps (the weight download link is given later in the post):

https://github.com/facebookresearch/sam3?tab=readme-ov-file

You can install directly by following the instructions at the link above.

The no-proxy version (if you cannot reach GitHub directly):

Create a new Conda environment:

conda create -n sam3 python=3.12
conda deactivate
conda activate sam3

Install PyTorch 2.7:

pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
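
Before moving on, it is worth confirming that this wheel actually sees your GPU. A minimal sanity check (assumes an NVIDIA card with a driver new enough for CUDA 12.6):

import torch

# The version string should end in +cu126 for the wheel above
print(torch.__version__)
# True means the CUDA build matches your driver
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))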

Get the code (in place of cloning the repo):

Files shared via Baidu Netdisk: the sam3 code
Link: https://pan.baidu.com/s/1fx6HjeO4NAwqmY3FpF1_rA?pwd=1111  Extraction code: 1111
After entering the code directory, run in a terminal:

pip install -e .

Install the other dependencies:

# For running example notebooks
pip install -e ".[notebooks]"

# For development
pip install -e ".[train,dev]"

Problems that may come up when installing on Windows

If you hit an error related to triton, the fix is as follows.

In the newly created environment, install:

pip install triton-windows
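
Note that triton-windows is a community build of Triton for Windows. A quick check that it is being picked up (minimal sketch):

# If this import fails, the environment is still missing a usable Triton
import triton
print(triton.__version__)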

Here is a simple single-image test example:

import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from scipy.ndimage import zoom
from sam3.model_builder import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor

# Load the model
model = build_sam3_image_model(checkpoint_path="checkpoint/sam3.pt")

processor = Sam3Processor(model)
# Load an image
image = Image.open(r"D:\vscode\python_project\sam3-main\cat1.jpg")
inference_state = processor.set_image(image)
# Prompt the model with text
output = processor.set_text_prompt(state=inference_state, prompt="A cat's ears")

# Get the masks, bounding boxes, and scores
masks, boxes, scores = output["masks"], output["boxes"], output["scores"]

# Move results to the CPU
masks = masks.cpu().numpy()
boxes = boxes.cpu().numpy()
scores = scores.cpu().numpy()

print(f"找到 {len(masks)} 个目标")
print(f"置信度分数: {scores}")
print(f"边界框:\n{boxes}")

# Create a color map: assign a different color to each instance
colors = plt.cm.Set3(np.linspace(0, 1, len(masks)))

# Create one large figure containing all instances
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

# 1. Show the original image
axes[0].imshow(image)
axes[0].set_title("Original Image")
axes[0].axis('off')

# 2. Show a composite overlay of all instance masks
img_array = np.array(image)
all_masks_overlay = img_array.copy()

for i, (mask, score, color) in enumerate(zip(masks, scores, colors)):
    # Ensure the mask is 2D
    if len(mask.shape) == 3:
        mask = mask[0]
    
    # Resize the mask to match the image
    if mask.shape != img_array.shape[:2]:
        scale_y = img_array.shape[0] / mask.shape[0]
        scale_x = img_array.shape[1] / mask.shape[1]
        mask = zoom(mask, (scale_y, scale_x), order=0) > 0.5
    
    # Semi-transparent colored overlay per instance
    mask_bool = mask > 0.5
    rgb_color = color[:3]  # take RGB, drop alpha
    all_masks_overlay[mask_bool] = all_masks_overlay[mask_bool] * 0.4 + np.array(rgb_color) * 255 * 0.6

axes[1].imshow(all_masks_overlay.astype(np.uint8))
axes[1].set_title(f"All Masks Overlay\n({len(masks)} instances)")
axes[1].axis('off')

# 3. Show the original image with bounding boxes
axes[2].imshow(image)
for i, (box, score, color) in enumerate(zip(boxes, scores, colors)):
    x1, y1, x2, y2 = box
    rect = plt.Rectangle((x1, y1), x2-x1, y2-y1, 
                         fill=False, color=color, linewidth=3)
    axes[2].add_patch(rect)
    # Add a label
    axes[2].text(x1, y1-5, f"Obj {i+1}: {score:.3f}", 
                bbox=dict(boxstyle="round,pad=0.3", fc=color, alpha=0.7),
                fontsize=8, color='black')
axes[2].set_title("Bounding Boxes with Scores")
axes[2].axis('off')

# 4. Show the combined mask of all instances (black and white)
combined_mask = np.zeros(img_array.shape[:2], dtype=bool)
for i, mask in enumerate(masks):
    if len(mask.shape) == 3:
        mask = mask[0]
    
    # Resize the mask to match the image
    if mask.shape != img_array.shape[:2]:
        scale_y = img_array.shape[0] / mask.shape[0]
        scale_x = img_array.shape[1] / mask.shape[1]
        mask = zoom(mask, (scale_y, scale_x), order=0) > 0.5
    
    combined_mask = np.logical_or(combined_mask, mask > 0.5)

axes[3].imshow(combined_mask, cmap='gray')
axes[3].set_title(f"Combined Mask\n({len(masks)} instances)")
axes[3].axis('off')

plt.tight_layout()
plt.savefig("all_instances_result.png", dpi=150, bbox_inches='tight')
print("\n所有实例结果已保存到 all_instances_result.png")
plt.show()

# Save each mask separately
for i, mask in enumerate(masks):
    if len(mask.shape) == 3:
        mask = mask[0]
    mask_image = Image.fromarray((mask * 255).astype(np.uint8))
    mask_image.save(f"mask_{i}.png")
    print(f"Mask {i} 已保存到 mask_{i}.png")

# Extra: create a detailed per-instance comparison figure
if len(masks) > 0:
    # Compute how many rows and columns are needed
    n_cols = min(4, len(masks))
    n_rows = (len(masks) + n_cols - 1) // n_cols
    
    fig2, axes2 = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 4*n_rows),
                               squeeze=False)  # squeeze=False: always a 2D array, even for a single subplot
    
    for i, (mask, box, score) in enumerate(zip(masks, boxes, scores)):
        row = i // n_cols
        col = i % n_cols
        
        # Ensure the mask is 2D
        if len(mask.shape) == 3:
            mask = mask[0]
        
        # Resize the mask to match the image
        if mask.shape != img_array.shape[:2]:
            scale_y = img_array.shape[0] / mask.shape[0]
            scale_x = img_array.shape[1] / mask.shape[1]
            mask = zoom(mask, (scale_y, scale_x), order=0) > 0.5
        
        # Create a colored mask overlay
        overlay = img_array.copy()
        mask_bool = mask > 0.5
        color = colors[i]
        rgb_color = color[:3]
        overlay[mask_bool] = overlay[mask_bool] * 0.5 + np.array(rgb_color) * 255 * 0.5
        
        axes2[row, col].imshow(overlay.astype(np.uint8))
        
        # Draw the bounding box
        x1, y1, x2, y2 = box
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1, 
                             fill=False, color=color, linewidth=2)
        axes2[row, col].add_patch(rect)
        
        axes2[row, col].set_title(f"Instance {i+1}\nScore: {score:.3f}")
        axes2[row, col].axis('off')
    
    # Hide unused subplots
    for i in range(len(masks), n_rows * n_cols):
        row = i // n_cols
        col = i % n_cols
        axes2[row, col].axis('off')
    
    plt.tight_layout()
    plt.savefig("detailed_instances_comparison.png", dpi=150, bbox_inches='tight')
    print("详细实例对比图已保存到 detailed_instances_comparison.png")
    plt.show()

In the code we prompt the model with text (the snippet above uses "A cat's ears"; the original run used the prompt "cat").

The segmentation quality is genuinely impressive. The one flaw: the tail of the cat in instance 6 was not segmented well; as instance 1 shows, the rest of the results are excellent.
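
If you only want high-confidence instances, the outputs can be filtered by score before plotting. A minimal sketch (the 0.5 cutoff is an arbitrary assumption, tune it per image):

# Hypothetical post-filter on the masks/boxes/scores arrays from the script above
keep = scores > 0.5
masks, boxes, scores = masks[keep], boxes[keep], scores[keep]
print(f"Kept {keep.sum()} of {len(keep)} instances")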
 

The video test code is as follows:


import torch
import numpy as np
from PIL import Image
import cv2
from sam3.model_builder import build_sam3_video_predictor

# ============ Configuration ============
VIDEO_PATH = "5.mp4"
TEXT_PROMPT = "people"
CHECKPOINT_PATH = "checkpoint/sam3.pt"
OUTPUT_VIDEO_PATH = "output_video_with_masks.mp4"

# ============ 1. Load the model ============
print("Loading model...")
video_predictor = build_sam3_video_predictor(checkpoint_path=CHECKPOINT_PATH)

# ============ 2. Read video info ============
cap = cv2.VideoCapture(VIDEO_PATH)
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.release()
print(f"视频信息: {width}x{height}, {fps}fps, 总帧数: {total_frames}")

# ============ 3. Start a session ============
print(f"\nProcessing video: {VIDEO_PATH}")
response = video_predictor.handle_request(
    request=dict(
        type="start_session",
        resource_path=VIDEO_PATH,
    )
)
session_id = response["session_id"]
print(f"会话ID: {session_id}")

# ============ 4. Add a text prompt on frame 0 ============
print(f"\nAdding text prompt: '{TEXT_PROMPT}'")
response = video_predictor.handle_request(
    request=dict(
        type="add_prompt",
        session_id=session_id,
        frame_index=0,
        text=TEXT_PROMPT,
    )
)

# Get the outputs for frame 0
output = response["outputs"]
print("Frame 0 outputs:")
print(f"  out_obj_ids: {output['out_obj_ids']}")
print(f"  out_probs: {output['out_probs']}")
print(f"  out_binary_masks shape: {output['out_binary_masks'].shape}")

# ============ 5. Call propagate_in_video via handle_stream_request ============
print("\nPropagating segmentation to all frames (via handle_stream_request)...")

# Store the masks for every frame
frame_masks = {}

# Key point: use handle_stream_request, not handle_request.
# propagate_in_video is a generator and is consumed via handle_stream_request.
for result in video_predictor.handle_stream_request(
    request=dict(
        type="propagate_in_video",
        session_id=session_id,
        propagation_direction="forward",  # 可以是 "forward", "backward", "both"
        start_frame_index=0,
        max_frame_num_to_track=None,  # None means track all frames
    )
):
    frame_idx = result["frame_index"]
    outputs = result["outputs"]
    frame_masks[frame_idx] = outputs
    
    if frame_idx % 50 == 0:
        print(f"  Propagation progress: frame {frame_idx}")

print(f"\n共获取 {len(frame_masks)} 帧的分割结果")

# Print the output structure of the first frame
if frame_masks:
    first_key = list(frame_masks.keys())[0]
    first_output = frame_masks[first_key]
    print(f"\n帧输出结构 (帧 {first_key}):")
    for k, v in first_output.items():
        if isinstance(v, np.ndarray):
            print(f"  {k}: ndarray shape={v.shape}")
        elif isinstance(v, torch.Tensor):
            print(f"  {k}: Tensor shape={v.shape}")
        else:
            print(f"  {k}: {type(v).__name__}")

# ============ 6. Write the output video ============
print("\nWriting the output video...")
cap = cv2.VideoCapture(VIDEO_PATH)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))

# Assign a distinct color to each object (BGR format)
colors = [
    (0, 255, 0),    # green
    (255, 0, 0),    # blue
    (0, 255, 255),  # yellow
    (255, 0, 255),  # magenta
    (0, 165, 255),  # orange
    (255, 255, 0),  # cyan
    (128, 0, 128),  # purple
    (0, 128, 128),  # olive
    (128, 128, 0),  # teal
    (128, 0, 0),    # dark blue
    (0, 0, 0),      # black
]

for frame_idx in range(total_frames):
    ret, frame = cap.read()
    if not ret:
        break
    
    result_frame = frame.copy()
    
    # Get the masks for the current frame
    if frame_idx in frame_masks:
        data = frame_masks[frame_idx]
        masks = data['out_binary_masks']  # shape: (num_objects, H, W)
        probs = data['out_probs']
        boxes = data.get('out_boxes_xywh', None)
        obj_ids = data.get('out_obj_ids', None)
        
        # Check whether there are valid masks
        if masks is not None and len(masks) > 0 and masks.shape[0] > 0:
            num_objects = masks.shape[0]
            
            # Iterate over all detected objects
            for obj_idx in range(num_objects):
                # Get this object's mask
                mask = masks[obj_idx]
                
                # Handle the mask size
                if mask.shape == (width, height):  # e.g. (1280, 720)
                    mask = mask.T  # transpose to (720, 1280)
                elif mask.shape != (height, width):
                    mask = cv2.resize(mask.astype(np.float32), (width, height),
                                     interpolation=cv2.INTER_NEAREST)
                
                # Binarize the mask
                mask_bool = mask > 0.5
                
                if np.any(mask_bool):
                    # Pick a color
                    color = colors[obj_idx % len(colors)]
                    
                    # Look up the object ID and confidence up front
                    # (previously `prob` was only set inside the centroid branch,
                    # which could raise a NameError at the box label below)
                    obj_id = obj_ids[obj_idx] if obj_ids is not None and obj_idx < len(obj_ids) else obj_idx
                    prob = probs[obj_idx] if probs is not None and obj_idx < len(probs) else 0
                    
                    # Semi-transparent colored overlay
                    overlay = result_frame.copy()
                    overlay[mask_bool] = color
                    result_frame = cv2.addWeighted(result_frame, 0.6, overlay, 0.4, 0)
                    
                    # Draw the contour in the same color
                    mask_uint8 = (mask_bool * 255).astype(np.uint8)
                    contours, _ = cv2.findContours(mask_uint8, cv2.RETR_EXTERNAL,
                                                   cv2.CHAIN_APPROX_SIMPLE)
                    cv2.drawContours(result_frame, contours, -1, color, 2)
                    
                    # Compute the mask centroid
                    M = cv2.moments(mask_uint8)
                    if M["m00"] != 0:
                        cx = int(M["m10"] / M["m00"])
                        cy = int(M["m01"] / M["m00"])
                        
                        # Draw the ID at the centroid, on a black background
                        # rectangle so the text stays readable
                        text = f"ID:{obj_id}"
                        text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)[0]
                        cv2.rectangle(result_frame, 
                                     (cx - text_size[0]//2 - 5, cy - text_size[1]//2 - 5),
                                     (cx + text_size[0]//2 + 5, cy + text_size[1]//2 + 5),
                                     (0, 0, 0), -1)  # black background
                        
                        # Draw the ID text
                        cv2.putText(result_frame, text, (cx - text_size[0]//2, cy + text_size[1]//2),
                                   cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)
                    
                    # Draw the bounding box
                    if boxes is not None and obj_idx < len(boxes):
                        box = boxes[obj_idx]
                        x, y, w, h = box
                        cv2.rectangle(result_frame, (int(x), int(y)), 
                                     (int(x + w), int(y + h)), color, 2)
                        
                        # Show the confidence above the box
                        label = f"P:{prob:.2f}"
                        cv2.putText(result_frame, label, (int(x), int(y) - 5),
                                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
            
            # Show the number of detected objects
            cv2.putText(result_frame, f"Objects: {num_objects}", (10, 60),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)
        else:
            # No objects detected; show a notice
            cv2.putText(result_frame, "No detection", (10, 60),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
    
    # Add the frame number
    cv2.putText(result_frame, f"Frame: {frame_idx}", (10, 30),
               cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
    
    out.write(result_frame)
    
    if frame_idx % 50 == 0:
        print(f"  Write progress: {frame_idx}/{total_frames}")

cap.release()
out.release()

# ============ 7. Close the session ============
video_predictor.handle_request(
    request=dict(type="close_session", session_id=session_id)
)

print(f"\n完成!输出保存到: {OUTPUT_VIDEO_PATH}")

# ============ 8. Create a preview GIF ============
print("\nCreating a preview GIF...")
cap = cv2.VideoCapture(OUTPUT_VIDEO_PATH)
gif_frames = []
for i in range(min(50, total_frames)):
    ret, frame = cap.read()
    if not ret:
        break
    frame_small = cv2.resize(frame, (320, 180))
    gif_frames.append(Image.fromarray(cv2.cvtColor(frame_small, cv2.COLOR_BGR2RGB)))

if gif_frames:
    gif_frames[0].save("preview.gif", save_all=True, append_images=gif_frames[1:],
                       duration=100, loop=0)
    print("预览GIF已保存: preview.gif")

cap.release()
print("\n全部完成!")

Real-time with an external camera:

import torch
import numpy as np
import cv2
from PIL import Image
from sam3.model_builder import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor
import time
from scipy.optimize import linear_sum_assignment

# ============ Configuration ============
TEXT_PROMPT = "bottle"
CHECKPOINT_PATH = "checkpoint/sam3.pt"
CAMERA_ID = 0

DISPLAY_WIDTH = 1280
DISPLAY_HEIGHT = 720

# Tracking parameters
IOU_THRESHOLD = 0.3  # IoU threshold for matching instances
MAX_DISAPPEARED = 10  # frames an object may be missing before its ID is dropped

# ============ Instance tracker ============
class InstanceTracker:
    """Simple IoU-based instance tracker"""
    
    def __init__(self, iou_threshold=0.3, max_disappeared=10):
        self.next_id = 0
        self.objects = {}  # {id: {"box": box, "disappeared": count}}
        self.iou_threshold = iou_threshold
        self.max_disappeared = max_disappeared
    
    def compute_iou(self, box1, box2):
        """Compute the IoU of two boxes"""
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2
        
        # Intersection
        inter_x_min = max(x1_min, x2_min)
        inter_y_min = max(y1_min, y2_min)
        inter_x_max = min(x1_max, x2_max)
        inter_y_max = min(y1_max, y2_max)
        
        if inter_x_max < inter_x_min or inter_y_max < inter_y_min:
            return 0.0
        
        inter_area = (inter_x_max - inter_x_min) * (inter_y_max - inter_y_min)
        
        # Union
        box1_area = (x1_max - x1_min) * (y1_max - y1_min)
        box2_area = (x2_max - x2_min) * (y2_max - y2_min)
        union_area = box1_area + box2_area - inter_area
        
        return inter_area / union_area if union_area > 0 else 0.0
    
    def update(self, boxes):
        """
        Update the tracker.
        boxes: boxes detected in the current frame, (N, 4) [x1, y1, x2, y2]
        Returns: the list of assigned instance IDs
        """
        if len(boxes) == 0:
            # Nothing detected: bump every existing object's disappeared count
            for obj_id in list(self.objects.keys()):
                self.objects[obj_id]["disappeared"] += 1
                if self.objects[obj_id]["disappeared"] > self.max_disappeared:
                    del self.objects[obj_id]
            return []
        
        # No existing objects yet: create a new ID for every detection
        if len(self.objects) == 0:
            ids = []
            for box in boxes:
                self.objects[self.next_id] = {
                    "box": box,
                    "disappeared": 0
                }
                ids.append(self.next_id)
                self.next_id += 1
            return ids
        
        # Build the IoU matrix
        object_ids = list(self.objects.keys())
        iou_matrix = np.zeros((len(object_ids), len(boxes)))
        
        for i, obj_id in enumerate(object_ids):
            old_box = self.objects[obj_id]["box"]
            for j, new_box in enumerate(boxes):
                iou_matrix[i, j] = self.compute_iou(old_box, new_box)
        
        # Optimal matching via the Hungarian algorithm;
        # linear_sum_assignment minimizes cost, so we use 1 - IoU
        cost_matrix = 1 - iou_matrix
        row_indices, col_indices = linear_sum_assignment(cost_matrix)
        
        # Assign IDs
        assigned_ids = [-1] * len(boxes)
        matched_objects = set()
        matched_detections = set()
        
        for row, col in zip(row_indices, col_indices):
            if iou_matrix[row, col] >= self.iou_threshold:
                obj_id = object_ids[row]
                assigned_ids[col] = obj_id
                self.objects[obj_id]["box"] = boxes[col]
                self.objects[obj_id]["disappeared"] = 0
                matched_objects.add(obj_id)
                matched_detections.add(col)
        
        # Create new IDs for unmatched detections
        for i in range(len(boxes)):
            if i not in matched_detections:
                self.objects[self.next_id] = {
                    "box": boxes[i],
                    "disappeared": 0
                }
                assigned_ids[i] = self.next_id
                self.next_id += 1
        
        # Bump the disappeared count for unmatched objects
        for obj_id in object_ids:
            if obj_id not in matched_objects:
                self.objects[obj_id]["disappeared"] += 1
                if self.objects[obj_id]["disappeared"] > self.max_disappeared:
                    del self.objects[obj_id]
        
        return assigned_ids
    
    def reset(self):
        """Reset the tracker"""
        self.next_id = 0
        self.objects = {}

# ============ 1. Load the model ============
print("Loading the SAM3 image model...")
model = build_sam3_image_model(checkpoint_path=CHECKPOINT_PATH)
processor = Sam3Processor(model)
print("✓ Model loaded!")

# ============ 2. Open the camera ============
print(f"Opening camera {CAMERA_ID}...")
cap = cv2.VideoCapture(CAMERA_ID)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, DISPLAY_WIDTH)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, DISPLAY_HEIGHT)

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(f"✓ Camera resolution: {width}x{height}")

# ============ 3. Initialization ============
tracker = InstanceTracker(iou_threshold=IOU_THRESHOLD, max_disappeared=MAX_DISAPPEARED)

colors = [
    (0, 255, 0), (255, 0, 0), (0, 255, 255), (255, 0, 255),
    (0, 165, 255), (255, 255, 0), (128, 0, 128), (0, 128, 128),
    (255, 128, 0), (128, 255, 0), (0, 128, 255), (255, 0, 128),
]

is_tracking = False
click_points = []
click_labels = []
click_mode = False
current_prompt = TEXT_PROMPT
use_text = True

fps_start = time.time()
fps_count = 0
display_fps = 0

# ============ Mouse callback ============
def mouse_callback(event, x, y, flags, param):
    global click_points, click_labels
    
    if not click_mode:
        return
    
    if event == cv2.EVENT_LBUTTONDOWN:
        click_points.append([x, y])
        click_labels.append(1)
        print(f"✓ 前景点: ({x}, {y})")
    elif event == cv2.EVENT_RBUTTONDOWN:
        click_points.append([x, y])
        click_labels.append(0)
        print(f"✓ 背景点: ({x}, {y})")

cv2.namedWindow('SAM3 with Instance Tracking')
cv2.setMouseCallback('SAM3 with Instance Tracking', mouse_callback)

# ============ Helper functions ============
def process_frame_with_text(frame, text_prompt):
    """Process a single frame with a text prompt"""
    try:
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)
        
        inference_state = processor.set_image(pil_image)
        output = processor.set_text_prompt(state=inference_state, prompt=text_prompt)
        
        masks = output["masks"].cpu().numpy()
        boxes = output["boxes"].cpu().numpy()
        scores = output["scores"].cpu().numpy()
        
        return masks, boxes, scores
    except Exception as e:
        print(f"处理失败: {e}")
        return None, None, None

def process_frame_with_points(frame, points, labels):
    """使用点提示处理单帧"""
    try:
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)
        
        inference_state = processor.set_image(pil_image)
        output = processor.set_point_prompt(
            state=inference_state,
            points=np.array(points),
            labels=np.array(labels)
        )
        
        masks = output["masks"].cpu().numpy()
        boxes = output["boxes"].cpu().numpy()
        scores = output["scores"].cpu().numpy()
        
        return masks, boxes, scores
    except Exception as e:
        print(f"处理失败: {e}")
        return None, None, None

def draw_results(frame, masks, boxes, scores, instance_ids):
    """绘制分割结果(带实例ID)"""
    if masks is None or len(masks) == 0:
        return frame
    
    result = frame.copy()
    
    for i, (mask, box, score, inst_id) in enumerate(zip(masks, boxes, scores, instance_ids)):
        # Handle mask dimensions
        if len(mask.shape) == 3:
            mask = mask[0]
        
        # Resize the mask
        if mask.shape != (height, width):
            mask = cv2.resize(mask.astype(np.float32), (width, height),
                            interpolation=cv2.INTER_NEAREST)
        
        mask_bool = mask > 0.5
        if not np.any(mask_bool):
            continue
        
        # Pick the color by instance ID (the same ID always keeps the same color)
        color = colors[inst_id % len(colors)]
        
        # Semi-transparent fill
        overlay = result.copy()
        overlay[mask_bool] = color
        result = cv2.addWeighted(result, 0.65, overlay, 0.35, 0)
        
        # Contour
        mask_uint8 = (mask_bool * 255).astype(np.uint8)
        contours, _ = cv2.findContours(mask_uint8, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(result, contours, -1, color, 3)
        
        # Bounding box
        x1, y1, x2, y2 = box.astype(int)
        cv2.rectangle(result, (x1, y1), (x2, y2), color, 3)
        
        # Compute the centroid
        M = cv2.moments(mask_uint8)
        if M["m00"] != 0:
            cx = int(M["m10"] / M["m00"])
            cy = int(M["m01"] / M["m00"])
            
            # Draw the ID (large font, black background)
            id_text = f"ID:{inst_id}"
            text_size = cv2.getTextSize(id_text, cv2.FONT_HERSHEY_SIMPLEX, 1.2, 3)[0]
            
            # Black background
            cv2.rectangle(result,
                         (cx - text_size[0]//2 - 8, cy - text_size[1]//2 - 8),
                         (cx + text_size[0]//2 + 8, cy + text_size[1]//2 + 8),
                         (0, 0, 0), -1)
            
            # ID text
            cv2.putText(result, id_text, 
                       (cx - text_size[0]//2, cy + text_size[1]//2),
                       cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
        
        # Show the score above the bounding box
        label = f"Score:{score:.2f}"
        cv2.putText(result, label, (x1, y1 - 10),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
    
    return result

# ============ Main loop ============
print("\n" + "="*70)
print("SAM3 image mode + instance ID tracking")
print("="*70)
print("Approach: independent per-frame detection + cross-frame ID matching")
print()
print("Controls:")
print("  [T] - use a text prompt")
print("  [C] - use click prompts")
print("  [Space] - start/stop tracking")
print("  [R] - reset (including IDs)")
print("  [Q] - quit")
print("="*70)
print()

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # FPS calculation
        fps_count += 1
        if fps_count % 30 == 0:
            display_fps = 30 / (time.time() - fps_start)
            fps_start = time.time()
        
        result_frame = frame.copy()
        
        # Tracking mode
        if is_tracking:
            process_start = time.time()
            
            if use_text:
                masks, boxes, scores = process_frame_with_text(frame, current_prompt)
            else:
                masks, boxes, scores = process_frame_with_points(frame, click_points, click_labels)
            
            process_time = time.time() - process_start
            
            if masks is not None and len(masks) > 0:
                # Update the tracker and get instance IDs
                instance_ids = tracker.update(boxes)
                
                # Draw the results
                result_frame = draw_results(frame, masks, boxes, scores, instance_ids)
                
                cv2.putText(result_frame, f"Tracking: {len(masks)} objects",
                           (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
                cv2.putText(result_frame, f"Process: {process_time*1000:.1f}ms",
                           (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2)
                cv2.putText(result_frame, f"Active IDs: {len(tracker.objects)}",
                           (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
            else:
                # No objects detected; still update the tracker
                tracker.update([])
                cv2.putText(result_frame, "No objects detected",
                           (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
        
        # Click mode
        elif click_mode:
            for i, pt in enumerate(click_points):
                color = (0, 255, 0) if click_labels[i] == 1 else (0, 0, 255)
                cv2.circle(result_frame, tuple(pt), 6, color, -1)
                cv2.circle(result_frame, tuple(pt), 8, (255, 255, 255), 2)
            
            cv2.putText(result_frame, f"Click mode: {len(click_points)} points",
                       (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)
            cv2.putText(result_frame, "Press SPACE to start",
                       (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)
        
        # Idle mode
        else:
            cv2.putText(result_frame, "Press [T] for text or [C] for click",
                       (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
        
        # FPS display
        cv2.putText(result_frame, f"FPS: {display_fps:.1f}",
                   (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
        
        cv2.imshow('SAM3 with Instance Tracking', result_frame)
        
        # Key handling
        key = cv2.waitKey(1) & 0xFF
        
        if key == ord('q'):
            break
        
        elif key == ord('t'):
            print("\n输入文本提示:")
            prompt = input("> ").strip()
            if prompt:
                current_prompt = prompt
                use_text = True
                click_mode = False
                print(f"✓ 文本提示: '{prompt}'")
        
        elif key == ord('c'):
            click_mode = True
            click_points = []
            click_labels = []
            use_text = False
            is_tracking = False
            print("\n点击模式 - 左键=前景,右键=背景")
        
        elif key == ord(' '):
            if click_mode and len(click_points) > 0:
                click_mode = False
                is_tracking = True
                tracker.reset()  # reset the tracker
                print(f"\n✓ Tracking started ({len(click_points)} points)")
            elif use_text:
                is_tracking = not is_tracking
                if is_tracking:
                    tracker.reset()  # reset the tracker
                    print(f"\n✓ Tracking started ('{current_prompt}')")
                else:
                    print("\n⏸ Tracking paused")
        
        elif key == ord('r'):
            print("\n重置...")
            is_tracking = False
            click_mode = False
            click_points = []
            click_labels = []
            use_text = True
            tracker.reset()
            print("✓ 重置完成(ID已清零)")

except KeyboardInterrupt:
    print("\n用户中断")
except Exception as e:
    print(f"\n错误: {e}")
    import traceback
    traceback.print_exc()

finally:
    cap.release()
    cv2.destroyAllWindows()
    print("✓ 完成!")

--------------------------------------------------------------------------------------------------------------------------------

Prompt-free segmentation of all masks

The core idea: generate a uniform grid of points over the image and use those points as prompts to trigger segmentation (the same principle as SAM1's AutomaticMaskGenerator).

import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

from sam3.model_builder import build_tracker, _create_vision_backbone, download_ckpt_from_hf
from sam3.model.sam1_task_predictor import SAM3InteractiveImagePredictor
from sam3.model.vl_combiner import SAM3VLBackbone

# ========== Build the predictor ==========
def build_sam3_point_predictor(checkpoint_path=None, device="cuda"):
    print("🔧 Building the SAM3 point-prompt predictor...")
    
    vision_backbone = _create_vision_backbone(enable_inst_interactivity=True)
    backbone = SAM3VLBackbone(scalp=1, visual=vision_backbone, text=None)
    
    tracker = build_tracker(apply_temporal_disambiguation=False, with_backbone=False)
    tracker.backbone = backbone
    
    predictor = SAM3InteractiveImagePredictor(tracker)
    
    if checkpoint_path is None:
        checkpoint_path = download_ckpt_from_hf()
    
    ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    if "model" in ckpt:
        ckpt = ckpt["model"]
    
    tracker_ckpt = {k.replace("tracker.", ""): v for k, v in ckpt.items() if "tracker" in k}
    tracker.load_state_dict(tracker_ckpt, strict=False)
    
    backbone_ckpt = {}
    for k, v in ckpt.items():
        if "detector.backbone.vision_backbone" in k or "detector.backbone.visual" in k:
            new_key = k.replace("detector.backbone.", "")
            backbone_ckpt[new_key] = v
    if backbone_ckpt:
        backbone.load_state_dict(backbone_ckpt, strict=False)
    
    predictor = predictor.to(device)
    predictor.eval()
    
    print("✅ 预测器构建完成!")
    return predictor


# ========== Automatic mask generator ==========
class Sam3AutomaticMaskGenerator:
    def __init__(self, predictor,
                 points_per_side: int = 32,
                 pred_iou_thresh: float = 0.7,
                 stability_score_thresh: float = 0.8,
                 min_mask_region_area: int = 100,
                 max_mask_region_ratio: float = 0.8,
                 nms_thresh: float = 0.7):
        self.predictor = predictor
        self.points_per_side = points_per_side
        self.pred_iou_thresh = pred_iou_thresh
        self.stability_score_thresh = stability_score_thresh
        self.min_mask_region_area = min_mask_region_area
        self.max_mask_region_ratio = max_mask_region_ratio
        self.nms_thresh = nms_thresh
    
    def _generate_point_grid(self, img_height, img_width):
        x = np.linspace(0, img_width, self.points_per_side + 2)[1:-1]
        y = np.linspace(0, img_height, self.points_per_side + 2)[1:-1]
        xx, yy = np.meshgrid(x, y)
        return np.stack([xx.flatten(), yy.flatten()], axis=1)
    
    def _compute_iou(self, mask1, mask2):
        intersection = np.logical_and(mask1, mask2).sum()
        union = np.logical_or(mask1, mask2).sum()
        return intersection / (union + 1e-8)
    
    def _nms_masks(self, masks, scores):
        if len(masks) == 0:
            return [], []
        indices = np.argsort(scores)[::-1]
        keep_masks, keep_scores = [], []
        while len(indices) > 0:
            idx = indices[0]
            keep_masks.append(masks[idx])
            keep_scores.append(scores[idx])
            remaining = [i for i in indices[1:] if self._compute_iou(masks[idx], masks[i]) < self.nms_thresh]
            indices = remaining
        return keep_masks, keep_scores
    
    def _process_mask(self, mask):
        """将logits转换为二值mask"""
        if mask.min() < 0 or mask.max() > 1:
            mask_sigmoid = 1 / (1 + np.exp(-mask.astype(np.float32).clip(-50, 50)))
        else:
            mask_sigmoid = mask
        return (mask_sigmoid > 0.5).astype(bool)
    
    def _compute_stability_score(self, logits, threshold_offset=1.0):
        high_thresh = (logits > threshold_offset).sum()
        low_thresh = (logits > -threshold_offset).sum()
        return float(high_thresh / (low_thresh + 1e-8))
    
    def generate(self, image):
        if isinstance(image, Image.Image):
            image_np = np.array(image)
        else:
            image_np = image
        
        img_height, img_width = image_np.shape[:2]
        total_pixels = img_height * img_width
        
        self.predictor.set_image(image_np)
        
        points = self._generate_point_grid(img_height, img_width)
        print(f"🔍 使用 {len(points)} 个采样点进行自动分割...")
        
        all_masks, all_scores = [], []
        
        for i, point in enumerate(points):
            try:
                masks, scores, logits = self.predictor.predict(
                    point_coords=np.array([[point[0], point[1]]]),
                    point_labels=np.array([1]),
                    multimask_output=True,
                    return_logits=True
                )
                
                # Pick the highest-scoring mask
                best_idx = np.argmax(scores)
                mask = masks[best_idx]
                score = scores[best_idx]
                logit = logits[best_idx]
                
                if score < self.pred_iou_thresh:
                    continue
                
                binary_mask = self._process_mask(mask)
                
                stability = self._compute_stability_score(logit)
                if stability < self.stability_score_thresh:
                    continue
                
                mask_area = binary_mask.sum()
                if mask_area < self.min_mask_region_area:
                    continue
                if mask_area > self.max_mask_region_ratio * total_pixels:
                    continue
                
                all_masks.append(binary_mask)
                all_scores.append(float(score))
                    
            except Exception as e:
                if i == 0:
                    print(f"   ⚠️ 错误: {e}")
                continue
        
        print(f"   收集到 {len(all_masks)} 个候选masks")
        
        if len(all_masks) == 0:
            return []
        
        final_masks, final_scores = self._nms_masks(all_masks, all_scores)
        print(f"   NMS后保留 {len(final_masks)} 个masks")
        
        results = []
        for mask, score in zip(final_masks, final_scores):
            rows = np.any(mask, axis=1)
            cols = np.any(mask, axis=0)
            if not np.any(rows) or not np.any(cols):
                continue
            y1, y2 = np.where(rows)[0][[0, -1]]
            x1, x2 = np.where(cols)[0][[0, -1]]
            
            results.append({
                'segmentation': mask,
                'area': int(mask.sum()),
                'bbox': [int(x1), int(y1), int(x2-x1), int(y2-y1)],
                'predicted_iou': score,
                'box_xyxy': [int(x1), int(y1), int(x2), int(y2)]
            })
        
        results.sort(key=lambda x: x['area'], reverse=True)
        return results


# ========== Visualization ==========
def visualize_instances(image, results, save_path="instance_segmentation.png"):
    """Visualize instance segmentation results, one color per instance"""
    if isinstance(image, Image.Image):
        image_np = np.array(image)
    else:
        image_np = image.copy()
    
    h, w = image_np.shape[:2]
    n_instances = len(results)
    
    if n_instances == 0:
        print("没有检测到实例")
        return
    
    # Generate a distinct color per instance
    colors = plt.cm.tab20(np.linspace(0, 1, max(n_instances, 20)))[:n_instances]
    
    # Build a colored instance overlay
    instance_overlay = image_np.copy().astype(np.float64)
    instance_map = np.zeros((h, w), dtype=np.int32)  # instance ID map
    
    for i, result in enumerate(results):
        mask = result['segmentation'].astype(bool)
        color = colors[i][:3]  # RGB
        
        # Blend in the color
        for c in range(3):
            instance_overlay[:,:,c][mask] = instance_overlay[:,:,c][mask] * 0.4 + color[c] * 255 * 0.6
        
        # Record the instance ID (starting from 1)
        instance_map[mask] = i + 1
    
    # Plot
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    
    # 1. Original image
    axes[0, 0].imshow(image_np)
    axes[0, 0].set_title("Original Image", fontsize=14)
    axes[0, 0].axis('off')
    
    # 2. Instance segmentation (colored overlay)
    axes[0, 1].imshow(instance_overlay.astype(np.uint8))
    axes[0, 1].set_title(f"Instance Segmentation ({n_instances} instances)", fontsize=14)
    axes[0, 1].axis('off')
    
    # 3. Instance ID map: pure masks, one color per instance, no image background
    colored_mask = np.zeros((h, w, 3), dtype=np.uint8)
    for i, result in enumerate(results):
        mask = result['segmentation'].astype(bool)
        color = (colors[i][:3] * 255).astype(np.uint8)
        colored_mask[mask] = color
    
    axes[1, 0].imshow(colored_mask)
    axes[1, 0].set_title("Instance Masks (colored)", fontsize=14)
    axes[1, 0].axis('off')
    
    # 4. Bounding boxes and labels
    axes[1, 1].imshow(image_np)
    for i, result in enumerate(results):
        box = result['box_xyxy']
        score = result['predicted_iou']
        color = colors[i][:3]
        
        x1, y1, x2, y2 = box
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1, 
                             fill=False, color=color, linewidth=2)
        axes[1, 1].add_patch(rect)
        axes[1, 1].text(x1, y1-3, f"{i+1}", fontsize=10, fontweight='bold',
                       color='white', bbox=dict(boxstyle="round,pad=0.2", fc=color, alpha=0.8))
    axes[1, 1].set_title("Bounding Boxes", fontsize=14)
    axes[1, 1].axis('off')
    
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    print(f"✅ 结果保存到 {save_path}")
    plt.show()
    
    # Print per-instance info
    print(f"\n📊 Detected {n_instances} instances:")
    for i, result in enumerate(results):
        print(f"   Instance {i+1}: score={result['predicted_iou']:.3f}, area={result['area']}")
    
    return instance_overlay, colored_mask


# ========== Main ==========
print("=" * 60)
print("SAM3 automatic instance segmentation")
print("=" * 60)

# Build the predictor
predictor = build_sam3_point_predictor(
    checkpoint_path="checkpoint/sam3.pt",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Create the generator
mask_generator = Sam3AutomaticMaskGenerator(
    predictor,
    points_per_side=16,
    pred_iou_thresh=0.7,
    stability_score_thresh=0.7,
    min_mask_region_area=1000,
    max_mask_region_ratio=0.5,   # filter out masks covering more than 50% of the image (background)
    nms_thresh=0.5
)

# Load the image
image = Image.open(r"D:\vscode\python_project\sam3-main\cat3.png")

# Generate the instance segmentation
results = mask_generator.generate(image)

print(f"\n✅ 找到 {len(results)} 个实例")

# Visualize
if len(results) > 0:
    visualize_instances(image, results, "instance_segmentation.png")
    
    # Save each instance's mask separately
    for i, result in enumerate(results):
        mask = result['segmentation']
        mask_img = Image.fromarray((mask * 255).astype(np.uint8))
        mask_img.save(f"instance_{i+1}_mask.png")
    print(f"\n✅ 单独mask已保存: instance_1_mask.png ~ instance_{len(results)}_mask.png")
else:
    print("No instances detected")

Core change versus the text-prompt demo:

Before: processor.set_text_prompt(state=inference_state, prompt="...") (a single text prompt)
After: mask_generator.generate(image) (a grid of point prompts, no text)

SAM3's current results are genuinely impressive; open-vocabulary segmentation really is remarkable to play with. I am looking forward to the creative follow-ups the community will build on it.

If you have thoughts on this, or on where segmentation is heading, feel free to discuss in the comments.

Personally, I feel segmentation, object detection, and tracking are about to merge into a single task.

----------------------------------------------------------------------------------

Continuing from the above:

A look at the quality of the instance segmentation / detection results:

ID switching is a classic pain point of tracking tasks; SAM3 substantially reduces ID confusion when tracking.
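
One crude way to put a number on this with the video demo above: compare the count of distinct IDs over the whole clip to the average number of objects per frame; if IDs were being switched or re-created, the first number balloons. A minimal sketch (assumes the frame_masks dict built in the video script; out_obj_ids is the per-frame ID list shown there):

# Hypothetical ID-stability check over frame_masks from the video demo
all_ids = set()
per_frame = []
for idx in sorted(frame_masks):
    ids = list(frame_masks[idx]["out_obj_ids"])
    all_ids.update(ids)
    per_frame.append(len(ids))
print(f"Distinct IDs across the clip: {len(all_ids)}")
print(f"Average objects per frame: {sum(per_frame) / max(len(per_frame), 1):.2f}")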


SAM3 weight download:

Files shared via Baidu Netdisk: the weights
Link: https://pan.baidu.com/s/1vG7_meIz_3YXwruxX5Fzbw?pwd=1111  Extraction code: 1111

Reposted from CSDN (IT technology community).

Original article: https://blog.csdn.net/qq_58949158/article/details/155102282
