-
-
Notifications
You must be signed in to change notification settings - Fork 9
Computer Vision
franklinic edited this page Jan 19, 2026
·
1 revision
AiDotNet provides 50+ computer vision models for object detection, image classification, segmentation, and more.
AiDotNet supports YOLO v8 through v11 for real-time object detection.
using AiDotNet.ComputerVision;
// Load YOLOv8 model
var detector = await YOLO<float>.LoadAsync(
version: YOLOVersion.V8,
size: YOLOSize.Nano, // n, s, m, l, x
device: Device.GPU);
// Detect objects in image
var image = await Image.LoadAsync("photo.jpg");
var detections = detector.Detect(image, confidenceThreshold: 0.5);
foreach (var det in detections)
{
Console.WriteLine($"{det.ClassName}: {det.Confidence:P1}");
Console.WriteLine($" Box: ({det.X}, {det.Y}, {det.Width}, {det.Height})");
}

| Version | Speed | Accuracy | Best For |
|---|---|---|---|
| YOLOv8n | Fastest | Good | Edge devices, real-time |
| YOLOv8s | Fast | Better | Balanced |
| YOLOv8m | Medium | High | General use |
| YOLOv8l | Slower | Higher | Accuracy focused |
| YOLOv8x | Slowest | Highest | Maximum accuracy |
| YOLOv11 | Fast | State-of-art | Latest architecture |
var detector = await DETR<float>.LoadAsync(
backbone: "resnet50",
numQueries: 100,
pretrained: true);
var detections = detector.Detect(image);

var detector = await FasterRCNN<float>.LoadAsync(
backbone: FasterRCNNBackbone.ResNet50FPN,
pretrained: true);
var detections = detector.Detect(image, nmsThreshold: 0.5);

var result = await new PredictionModelBuilder<float, Tensor<float>, Detection[]>()
.WithTrainingData(images, annotations) // COCO or Pascal VOC format
.WithObjectDetector(det => det
.UseYOLO(YOLOVersion.V8, YOLOSize.Medium)
.AugmentData(aug => aug
.RandomHorizontalFlip()
.RandomScale(0.8, 1.2)
.ColorJitter()))
.WithOptimizer(OptimizerType.AdamW, learningRate: 1e-4)
.WithEpochs(100)
.ConfigureGpu(gpu => gpu.Enabled = true)
.BuildAsync();
// Save trained model
await result.Model.SaveAsync("custom_detector.aidotnet");

// ResNet
var classifier = await ResNet<float>.LoadAsync(
variant: ResNetVariant.ResNet50,
pretrained: true,
numClasses: 1000);
var predictions = classifier.Classify(image, topK: 5);
// EfficientNet
var classifier = await EfficientNet<float>.LoadAsync(
variant: EfficientNetVariant.B4,
pretrained: true);
// Vision Transformer
var classifier = await ViT<float>.LoadAsync(
variant: ViTVariant.Base16,
pretrained: true);

var result = await new PredictionModelBuilder<float, Tensor<float>, int>()
.WithTrainingData(images, labels)
.WithNeuralNetwork(nn => nn
.UseEfficientNet(EfficientNetVariant.B0, pretrained: true)
.FreezeBackbone() // Only train the head
.ReplaceHead(numClasses: 10))
.WithOptimizer(OptimizerType.Adam, learningRate: 1e-4)
.WithEpochs(20)
.WithDataAugmentation(aug => aug
.RandomCrop(224, 224)
.RandomHorizontalFlip()
.Normalize(ImageNet.Mean, ImageNet.Std))
.BuildAsync();

| Model | Parameters | ImageNet Acc | Speed |
|---|---|---|---|
| ResNet18 | 11M | 69.8% | Fast |
| ResNet50 | 25M | 76.1% | Medium |
| ResNet152 | 60M | 78.3% | Slow |
| EfficientNet-B0 | 5.3M | 77.1% | Fast |
| EfficientNet-B4 | 19M | 82.9% | Medium |
| EfficientNet-B7 | 66M | 84.3% | Slow |
| ViT-Base/16 | 86M | 81.8% | Medium |
| ViT-Large/16 | 304M | 85.2% | Slow |
| ConvNeXt-T | 29M | 82.1% | Medium |
| ConvNeXt-L | 198M | 84.3% | Slow |
// DeepLabV3+
var segmenter = await DeepLabV3<float>.LoadAsync(
backbone: "resnet101",
numClasses: 21, // Pascal VOC classes
pretrained: true);
var segmentationMask = segmenter.Segment(image);
// Returns tensor of shape [H, W] with class indices
// U-Net
var segmenter = await UNet<float>.LoadAsync(
encoderName: "resnet34",
numClasses: 2); // Binary segmentation

// Mask R-CNN
var segmenter = await MaskRCNN<float>.LoadAsync(
backbone: MaskRCNNBackbone.ResNet50FPN,
pretrained: true);
var instances = segmenter.Segment(image);
foreach (var inst in instances)
{
Console.WriteLine($"{inst.ClassName}: {inst.Confidence:P1}");
Console.WriteLine($" BBox: {inst.BoundingBox}");
Console.WriteLine($" Mask pixels: {inst.Mask.Sum()}");
}

var segmenter = await PanopticFPN<float>.LoadAsync(pretrained: true);
var result = segmenter.Segment(image);
// result.SemanticMask - background classes
// result.InstanceMasks - individual object masks

var sam = await SAM<float>.LoadAsync(
variant: SAMVariant.ViT_H, // H, L, or B
device: Device.GPU);
// Segment with point prompt
var masks = sam.Segment(image, points: new[] { (512, 384) });
// Segment with box prompt
var masks = sam.Segment(image, box: (100, 100, 300, 300));
// Auto-mask everything
var allMasks = sam.GenerateMasks(image);

var detector = await TextDetector<float>.LoadAsync(
model: TextDetectionModel.CRAFT);
var textRegions = detector.Detect(image);
foreach (var region in textRegions)
{
Console.WriteLine($"Text region at: {region.BoundingBox}");
Console.WriteLine($" Confidence: {region.Confidence:P1}");
}

var recognizer = await TextRecognizer<float>.LoadAsync(
model: TextRecognitionModel.CRNN);
var text = recognizer.Recognize(textImage);
Console.WriteLine($"Recognized: {text}");

var ocr = await OCRPipeline<float>.LoadAsync();
var results = ocr.ReadText(image);
foreach (var result in results)
{
Console.WriteLine($"'{result.Text}' at {result.BoundingBox}");
}

var detector = await FaceDetector<float>.LoadAsync(
model: FaceDetectionModel.RetinaFace);
var faces = detector.Detect(image);
foreach (var face in faces)
{
Console.WriteLine($"Face at {face.BoundingBox}");
Console.WriteLine($" Landmarks: {face.Landmarks.Count}");
}

var recognizer = await FaceRecognizer<float>.LoadAsync(
model: FaceRecognitionModel.ArcFace);
// Get face embedding
var embedding = recognizer.GetEmbedding(faceImage);
// Compare faces
var similarity = recognizer.Compare(face1, face2);
Console.WriteLine($"Similarity: {similarity:F4}"); // 0-1, higher = more similar

var poseEstimator = await PoseEstimator<float>.LoadAsync(
model: PoseModel.HRNet);
var poses = poseEstimator.Estimate(image);
foreach (var pose in poses)
{
foreach (var keypoint in pose.Keypoints)
{
Console.WriteLine($"{keypoint.Name}: ({keypoint.X}, {keypoint.Y}) conf={keypoint.Confidence:F2}");
}
}

var diffusion = await StableDiffusion<float>.LoadAsync(
version: StableDiffusionVersion.V2_1);
var image = await diffusion.GenerateAsync(
prompt: "a beautiful sunset over mountains, photorealistic",
negativePrompt: "blurry, low quality",
width: 512,
height: 512,
numInferenceSteps: 50,
guidanceScale: 7.5);
await image.SaveAsync("generated.png");

var result = await diffusion.Img2ImgAsync(
image: inputImage,
prompt: "convert to oil painting style",
strength: 0.7);

var result = await diffusion.InpaintAsync(
image: inputImage,
mask: maskImage, // White = area to inpaint
prompt: "a red sports car");

var augmentation = new ImageAugmentation()
// Geometric transforms
.RandomHorizontalFlip(probability: 0.5)
.RandomVerticalFlip(probability: 0.1)
.RandomRotation(degrees: 15)
.RandomAffine(translate: (0.1, 0.1), scale: (0.9, 1.1))
.RandomCrop(224, 224)
.RandomResizedCrop(224, scale: (0.8, 1.0))
// Color transforms
.ColorJitter(brightness: 0.2, contrast: 0.2, saturation: 0.2, hue: 0.1)
.RandomGrayscale(probability: 0.1)
.GaussianBlur(kernelSize: 3)
// Normalization
.Normalize(mean: ImageNet.Mean, std: ImageNet.Std);
// Apply to dataset
var augmentedImage = augmentation.Apply(image);

// Load and preprocess image
var image = await Image.LoadAsync("photo.jpg");
// Resize
var resized = image.Resize(224, 224, InterpolationMode.Bilinear);
// Convert to tensor
var tensor = resized.ToTensor(); // [C, H, W] normalized to [0, 1]
// Normalize for ImageNet
var normalized = tensor.Normalize(
mean: new[] { 0.485f, 0.456f, 0.406f },
std: new[] { 0.229f, 0.224f, 0.225f });
// Add batch dimension
var batch = normalized.Unsqueeze(0); // [1, C, H, W]

// Export to ONNX for deployment
await model.ExportOnnxAsync("model.onnx", inputShape: new[] { 1, 3, 224, 224 });

var optimized = await model.OptimizeForTensorRTAsync(
precision: Precision.FP16,
batchSize: 8);

- Neural Networks - Build custom architectures
- NLP - Text processing
- Audio Processing - Audio models
- Distributed Training - Scale training
Getting Started
Core Concepts
Reference
Community