Skip to content

Computer Vision

franklinic edited this page Jan 19, 2026 · 1 revision

Computer Vision

AiDotNet provides 50+ computer vision models for object detection, image classification, segmentation, and more.

Object Detection

YOLO (You Only Look Once)

AiDotNet supports YOLO v8 through v11 for real-time object detection.

using AiDotNet.ComputerVision;

// Load YOLOv8 model
var detector = await YOLO<float>.LoadAsync(
    version: YOLOVersion.V8,
    size: YOLOSize.Nano,  // n, s, m, l, x
    device: Device.GPU);

// Detect objects in image
var image = await Image.LoadAsync("photo.jpg");
var detections = detector.Detect(image, confidenceThreshold: 0.5);

foreach (var det in detections)
{
    Console.WriteLine($"{det.ClassName}: {det.Confidence:P1}");
    Console.WriteLine($"  Box: ({det.X}, {det.Y}, {det.Width}, {det.Height})");
}

YOLO Variants

Version Speed Accuracy Best For
YOLOv8n Fastest Good Edge devices, real-time
YOLOv8s Fast Better Balanced
YOLOv8m Medium High General use
YOLOv8l Slower Higher Accuracy focused
YOLOv8x Slowest Highest Maximum accuracy
YOLOv11 Fast State-of-the-art Latest architecture

DETR (Detection Transformer)

var detector = await DETR<float>.LoadAsync(
    backbone: "resnet50",
    numQueries: 100,
    pretrained: true);

var detections = detector.Detect(image);

Faster R-CNN

var detector = await FasterRCNN<float>.LoadAsync(
    backbone: FasterRCNNBackbone.ResNet50FPN,
    pretrained: true);

var detections = detector.Detect(image, nmsThreshold: 0.5);

Training Custom Object Detector

var result = await new PredictionModelBuilder<float, Tensor<float>, Detection[]>()
    .WithTrainingData(images, annotations)  // COCO or Pascal VOC format
    .WithObjectDetector(det => det
        .UseYOLO(YOLOVersion.V8, YOLOSize.Medium)
        .AugmentData(aug => aug
            .RandomHorizontalFlip()
            .RandomScale(0.8, 1.2)
            .ColorJitter()))
    .WithOptimizer(OptimizerType.AdamW, learningRate: 1e-4)
    .WithEpochs(100)
    .ConfigureGpu(gpu => gpu.Enabled = true)
    .BuildAsync();

// Save trained model
await result.Model.SaveAsync("custom_detector.aidotnet");

Image Classification

Using Pre-trained Models

// ResNet
var resnetClassifier = await ResNet<float>.LoadAsync(
    variant: ResNetVariant.ResNet50,
    pretrained: true,
    numClasses: 1000);

var predictions = resnetClassifier.Classify(image, topK: 5);

// EfficientNet
var efficientNetClassifier = await EfficientNet<float>.LoadAsync(
    variant: EfficientNetVariant.B4,
    pretrained: true);

// Vision Transformer
var vitClassifier = await ViT<float>.LoadAsync(
    variant: ViTVariant.Base16,
    pretrained: true);

Fine-tuning for Custom Classes

var result = await new PredictionModelBuilder<float, Tensor<float>, int>()
    .WithTrainingData(images, labels)
    .WithNeuralNetwork(nn => nn
        .UseEfficientNet(EfficientNetVariant.B0, pretrained: true)
        .FreezeBackbone()  // Only train the head
        .ReplaceHead(numClasses: 10))
    .WithOptimizer(OptimizerType.Adam, learningRate: 1e-4)
    .WithEpochs(20)
    .WithDataAugmentation(aug => aug
        .RandomCrop(224, 224)
        .RandomHorizontalFlip()
        .Normalize(ImageNet.Mean, ImageNet.Std))
    .BuildAsync();

Available Classification Models

Model Parameters ImageNet Acc Speed
ResNet18 11M 69.8% Fast
ResNet50 25M 76.1% Medium
ResNet152 60M 78.3% Slow
EfficientNet-B0 5.3M 77.1% Fast
EfficientNet-B4 19M 82.9% Medium
EfficientNet-B7 66M 84.3% Slow
ViT-Base/16 86M 81.8% Medium
ViT-Large/16 304M 85.2% Slow
ConvNeXt-T 29M 82.1% Medium
ConvNeXt-L 198M 84.3% Slow

Image Segmentation

Semantic Segmentation

// DeepLabV3+
var deepLabSegmenter = await DeepLabV3<float>.LoadAsync(
    backbone: "resnet101",
    numClasses: 21,  // Pascal VOC classes
    pretrained: true);

var segmentationMask = deepLabSegmenter.Segment(image);
// Returns tensor of shape [H, W] with class indices

// U-Net
var unetSegmenter = await UNet<float>.LoadAsync(
    encoderName: "resnet34",
    numClasses: 2);  // Binary segmentation

Instance Segmentation

// Mask R-CNN
var segmenter = await MaskRCNN<float>.LoadAsync(
    backbone: MaskRCNNBackbone.ResNet50FPN,
    pretrained: true);

var instances = segmenter.Segment(image);

foreach (var inst in instances)
{
    Console.WriteLine($"{inst.ClassName}: {inst.Confidence:P1}");
    Console.WriteLine($"  BBox: {inst.BoundingBox}");
    Console.WriteLine($"  Mask pixels: {inst.Mask.Sum()}");
}

Panoptic Segmentation

var segmenter = await PanopticFPN<float>.LoadAsync(pretrained: true);

var result = segmenter.Segment(image);
// result.SemanticMask - background classes
// result.InstanceMasks - individual object masks

Segment Anything Model (SAM)

var sam = await SAM<float>.LoadAsync(
    variant: SAMVariant.ViT_H,  // H, L, or B
    device: Device.GPU);

// Segment with point prompt
var pointMasks = sam.Segment(image, points: new[] { (512, 384) });

// Segment with box prompt
var boxMasks = sam.Segment(image, box: (100, 100, 300, 300));

// Auto-mask everything
var allMasks = sam.GenerateMasks(image);

OCR (Optical Character Recognition)

Text Detection

var detector = await TextDetector<float>.LoadAsync(
    model: TextDetectionModel.CRAFT);

var textRegions = detector.Detect(image);

foreach (var region in textRegions)
{
    Console.WriteLine($"Text region at: {region.BoundingBox}");
    Console.WriteLine($"  Confidence: {region.Confidence:P1}");
}

Text Recognition

var recognizer = await TextRecognizer<float>.LoadAsync(
    model: TextRecognitionModel.CRNN);

var text = recognizer.Recognize(textImage);
Console.WriteLine($"Recognized: {text}");

End-to-End OCR

var ocr = await OCRPipeline<float>.LoadAsync();

var results = ocr.ReadText(image);

foreach (var result in results)
{
    Console.WriteLine($"'{result.Text}' at {result.BoundingBox}");
}

Face Detection and Recognition

Face Detection

var detector = await FaceDetector<float>.LoadAsync(
    model: FaceDetectionModel.RetinaFace);

var faces = detector.Detect(image);

foreach (var face in faces)
{
    Console.WriteLine($"Face at {face.BoundingBox}");
    Console.WriteLine($"  Landmarks: {face.Landmarks.Count}");
}

Face Recognition

var recognizer = await FaceRecognizer<float>.LoadAsync(
    model: FaceRecognitionModel.ArcFace);

// Get face embedding
var embedding = recognizer.GetEmbedding(faceImage);

// Compare faces
var similarity = recognizer.Compare(face1, face2);
Console.WriteLine($"Similarity: {similarity:F4}");  // 0-1, higher = more similar

Pose Estimation

var poseEstimator = await PoseEstimator<float>.LoadAsync(
    model: PoseModel.HRNet);

var poses = poseEstimator.Estimate(image);

foreach (var pose in poses)
{
    foreach (var keypoint in pose.Keypoints)
    {
        Console.WriteLine($"{keypoint.Name}: ({keypoint.X}, {keypoint.Y}) conf={keypoint.Confidence:F2}");
    }
}

Image Generation

Stable Diffusion

var diffusion = await StableDiffusion<float>.LoadAsync(
    version: StableDiffusionVersion.V2_1);

var image = await diffusion.GenerateAsync(
    prompt: "a beautiful sunset over mountains, photorealistic",
    negativePrompt: "blurry, low quality",
    width: 512,
    height: 512,
    numInferenceSteps: 50,
    guidanceScale: 7.5);

await image.SaveAsync("generated.png");

Image-to-Image

var result = await diffusion.Img2ImgAsync(
    image: inputImage,
    prompt: "convert to oil painting style",
    strength: 0.7);

Inpainting

var result = await diffusion.InpaintAsync(
    image: inputImage,
    mask: maskImage,  // White = area to inpaint
    prompt: "a red sports car");

Data Augmentation

var augmentation = new ImageAugmentation()
    // Geometric transforms
    .RandomHorizontalFlip(probability: 0.5)
    .RandomVerticalFlip(probability: 0.1)
    .RandomRotation(degrees: 15)
    .RandomAffine(translate: (0.1, 0.1), scale: (0.9, 1.1))
    .RandomCrop(224, 224)
    .RandomResizedCrop(224, scale: (0.8, 1.0))

    // Color transforms
    .ColorJitter(brightness: 0.2, contrast: 0.2, saturation: 0.2, hue: 0.1)
    .RandomGrayscale(probability: 0.1)
    .GaussianBlur(kernelSize: 3)

    // Normalization
    .Normalize(mean: ImageNet.Mean, std: ImageNet.Std);

// Apply to dataset
var augmentedImage = augmentation.Apply(image);

Image Preprocessing

// Load and preprocess image
var image = await Image.LoadAsync("photo.jpg");

// Resize
var resized = image.Resize(224, 224, InterpolationMode.Bilinear);

// Convert to tensor
var tensor = resized.ToTensor();  // [C, H, W] normalized to [0, 1]

// Normalize for ImageNet
var normalized = tensor.Normalize(
    mean: new[] { 0.485f, 0.456f, 0.406f },
    std: new[] { 0.229f, 0.224f, 0.225f });

// Add batch dimension
var batch = normalized.Unsqueeze(0);  // [1, C, H, W]

Model Export

ONNX Export

// Export to ONNX for deployment
await model.ExportOnnxAsync("model.onnx", inputShape: new[] { 1, 3, 224, 224 });

TensorRT Optimization

var optimized = await model.OptimizeForTensorRTAsync(
    precision: Precision.FP16,
    batchSize: 8);

Next Steps

Clone this wiki locally