[mob][photos] Use existing image utils for clip preprocessing

This commit is contained in:
laurenspriem 2024-06-28 18:20:35 +05:30
parent f03cea7252
commit d7e1b737d8
2 changed files with 52 additions and 74 deletions

View File

@@ -2,9 +2,9 @@ import "dart:io";
import "dart:math";
import "dart:typed_data";
import 'package:image/image.dart' as img;
import "package:logging/logging.dart";
import "package:onnxruntime/onnxruntime.dart";
import "package:photos/utils/image_ml_util.dart";
class OnnxImageEncoder {
final _logger = Logger("OnnxImageEncoder");
@@ -27,80 +27,14 @@ class OnnxImageEncoder {
}
Future<List<double>> inferByImage(Map args) async {
final rgb = img.decodeImage(await File(args["imagePath"]).readAsBytes())!;
final imageData = await File(args["imagePath"]).readAsBytes();
final image = await decodeImageFromData(imageData);
final ByteData imgByteData = await getByteDataFromImage(image);
final int imageWidth = rgb.width;
final int imageHeight = rgb.height;
final int inputSize = 3 * imageWidth * imageHeight;
final inputImage = List.filled(inputSize, 0.toDouble());
const int requiredWidth = 224;
const int requiredHeight = 224;
const int totalSize = 3 * requiredWidth * requiredHeight;
// Load image into List<double> inputImage
for (int y = 0; y < imageHeight; y++) {
for (int x = 0; x < imageWidth; x++) {
final int i = 3 * (y * imageWidth + x);
final pixel = rgb.getPixel(x, y);
inputImage[i] = pixel.r.toDouble();
inputImage[i + 1] = pixel.g.toDouble();
inputImage[i + 2] = pixel.b.toDouble();
}
}
final result = List.filled(totalSize, 0.toDouble());
final invertedScale = max(imageWidth, imageHeight) / 224;
final int scaledWidth = (imageWidth / invertedScale + 0.5).toInt();
final int scaledHeight = (imageHeight / invertedScale + 0.5).toInt();
final mean = [0.48145466, 0.4578275, 0.40821073];
final std = [0.26862954, 0.26130258, 0.27577711];
for (int y = 0; y < scaledHeight; y++) {
for (int x = 0; x < scaledWidth; x++) {
for (int c = 0; c < 3; c++) {
//linear interpolation
final double scaledX = (x + 0.5) * invertedScale - 0.5;
final double scaledY = (y + 0.5) * invertedScale - 0.5;
final int x0 = max(0, scaledX.floor());
final int y0 = max(0, scaledY.floor());
final int x1 = min(x0 + 1, imageWidth - 1);
final int y1 = min(y0 + 1, imageHeight - 1);
final double dx = scaledX - x0;
final double dy = scaledY - y0;
final int j00 = 3 * (y0 * imageWidth + x0) + c;
final int j01 = 3 * (y0 * imageWidth + x1) + c;
final int j10 = 3 * (y1 * imageWidth + x0) + c;
final int j11 = 3 * (y1 * imageWidth + x1) + c;
final double pixel1 = inputImage[j00];
final double pixel2 = inputImage[j01];
final double pixel3 = inputImage[j10];
final double pixel4 = inputImage[j11];
final double v0 = pixel1 * (1 - dx) + pixel2 * dx;
final double v1 = pixel3 * (1 - dx) + pixel4 * dx;
final double v = v0 * (1 - dy) + v1 * dy;
final int v2 = min(max(v.round(), 0), 255);
final int i = (y * requiredWidth + x) + c * 224 * 224;
result[i] = ((v2 / 255) - mean[c]) / std[c];
}
}
}
final floatList = Float32List.fromList(result);
final inputList = await preprocessImageClip(image, imgByteData);
final inputOrt =
OrtValueTensor.createTensorWithDataList(floatList, [1, 3, 224, 224]);
OrtValueTensor.createTensorWithDataList(inputList, [1, 3, 224, 224]);
final inputs = {'input': inputOrt};
final session = OrtSession.fromAddress(args["address"]);
final runOptions = OrtRunOptions();

View File

@@ -193,6 +193,48 @@ Future<(Float32List, Dimensions, Dimensions)>
);
}
/// Prepares [image] for CLIP image-encoder inference.
///
/// Rescales the image with a uniform (aspect-ratio preserving) scale so it
/// fits inside a 224x224 canvas, fills the uncovered remainder with grey
/// (114, 114, 114), and returns the pixels as a planar channel-first
/// (R plane, G plane, B plane) [Float32List], normalized with the CLIP
/// per-channel mean/std constants. [imgByteData] holds the raw RGBA bytes
/// of [image] and is forwarded to the bicubic sampler.
Future<Float32List> preprocessImageClip(
  Image image,
  ByteData imgByteData,
) async {
  const int requiredWidth = 224;
  const int requiredHeight = 224;
  const int planeSize = requiredWidth * requiredHeight;
  // CLIP normalization constants, per channel (R, G, B).
  const mean = [0.48145466, 0.4578275, 0.40821073];
  const std = [0.26862954, 0.26130258, 0.27577711];
  // Fill color for the canvas area not covered by the rescaled image.
  const padPixel = Color.fromRGBO(114, 114, 114, 1.0);

  // Largest uniform scale that keeps the whole image inside the canvas.
  final scale = min(requiredWidth / image.width, requiredHeight / image.height);
  final scaledWidth = (image.width * scale).round().clamp(0, requiredWidth);
  final scaledHeight = (image.height * scale).round().clamp(0, requiredHeight);

  final processedBytes = Float32List(3 * planeSize);
  for (var h = 0; h < requiredHeight; h++) {
    for (var w = 0; w < requiredWidth; w++) {
      final bool outsideImage = w >= scaledWidth || h >= scaledHeight;
      final Color pixel = outsideImage
          ? padPixel
          // Map the canvas coordinate back to source space and sample
          // the image with bicubic interpolation.
          : _getPixelBicubic(w / scale, h / scale, image, imgByteData);
      final int planeIndex = h * requiredWidth + w;
      processedBytes[planeIndex] = ((pixel.red / 255) - mean[0]) / std[0];
      processedBytes[planeIndex + planeSize] =
          ((pixel.green / 255) - mean[1]) / std[1];
      processedBytes[planeIndex + 2 * planeSize] =
          ((pixel.blue / 255) - mean[2]) / std[2];
    }
  }
  return processedBytes;
}
Future<(Float32List, List<AlignmentResult>, List<bool>, List<double>, Size)>
preprocessToMobileFaceNetFloat32List(
Image image,
@@ -225,7 +267,9 @@ Future<(Float32List, List<AlignmentResult>, List<bool>, List<double>, Size)>
SimilarityTransform.estimate(face.allKeypoints);
if (!correctlyEstimated) {
log('Face alignment failed because not able to estimate SimilarityTransform, for face: $face');
throw Exception('Face alignment failed because not able to estimate SimilarityTransform');
throw Exception(
'Face alignment failed because not able to estimate SimilarityTransform',
);
}
alignmentResults.add(alignmentResult);
@@ -639,4 +683,4 @@ Color _getPixelBicubic(num fx, num fy, Image image, ByteData byteDataRgba) {
// final c3 = cubic(dy, ip3, ic3, in3, ia3);
return Color.fromRGBO(c0, c1, c2, 1.0);
}
}