
I’m working on a text recognition project using Apple’s Vision framework and am trying to leverage VNRecognizeTextRequest for extracting bounding boxes of text in images. My goal is to accurately identify and obtain bounding boxes for individual words rather than entire sentences or text blocks.

Most resources and guides I’ve come across focus on obtaining bounding boxes for sentences or larger text blocks, with minimal discussion on targeting individual words specifically. Additionally, I’m aware that VNRecognizeTextRequest offers two recognition accuracies: .fast and .accurate. Based on my testing, the .fast mode’s recognition quality is significantly inferior to .accurate, thus I prefer using the latter to ensure the highest quality of text recognition.
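
For reference, these are the standard VNRecognizeTextRequest options I'm aware of; as far as I can tell, none of them change the granularity of the returned observations (the values below are only illustrative):

import Vision

// Standard VNRecognizeTextRequest options; none of these appear to control
// whether observations come back per line/block or per word.
let request = VNRecognizeTextRequest()
request.recognitionLevel = .accurate             // .fast trades quality for speed
request.usesLanguageCorrection = true            // language-model post-processing
request.recognitionLanguages = ["en-US"]         // hint/restrict the language
request.minimumTextHeight = 0.0                  // relative to image height; 0 = recognize everything
request.customWords = ["VNRecognizeTextRequest"] // extra vocabulary for correction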

However, I’m encountering a challenge: despite setting the recognition to .accurate, I’m still unable to obtain bounding boxes for individual words—the results still encompass larger text blocks.

Does anyone know how to configure VNRecognizeTextRequest, or use any other method, to specifically detect bounding boxes for individual words? Am I missing any specific settings or parameters that could refine the detection to this level of detail?

I would greatly appreciate any advice or guidance on this matter. Thank you!

Here is a simple example that gets sentence-level bounding boxes:

import SwiftUI
import Vision

struct OCR: View {
  @State var image: UIImage? = UIImage(named: "test")
  @State var texts: [String] = []
  @State var positions: [CGRect] = []
  
  @State var size: CGSize = CGSize()
  
  func VNImageRectForNormalizedRect(rect: CGRect, imageSize: CGSize) -> CGRect {
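    // Converts a Vision-normalized rect (origin at the bottom-left, y pointing up)
    // into view/drawing coordinates (origin at the top-left), scaled to imageSize.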
    let width = imageSize.width
    let height = imageSize.height
    
    let x = rect.minX * width
    let y = (1 - rect.maxY) * height
    let rectWidth = rect.width * width
    let rectHeight = rect.height * height
    
    return CGRect(x: x, y: y, width: rectWidth, height: rectHeight)
  }
  
  var body: some View {
    ZStack {
      if let image = image {
        Image(uiImage: image)
          .resizable()
          .aspectRatio(contentMode: .fit)
          .background {
            GeometryReader { geo in
              Color.clear
                .onAppear {
                  size = geo.size
                }
            }
          }
          .overlay(Canvas { context, size in
            for position in positions {
              let normalizedRect = VNImageRectForNormalizedRect(rect: position, imageSize: size)
              context.stroke(Path(normalizedRect), with: .color(.red), lineWidth: 1)
            }
          })
          .onAppear {
            recognizeText(image: image) { t, p in
              texts = t
              positions = p
            }
          }
      } else {
        Text("Their is no picture")
      }
    }
  }
}

extension OCR {
  func recognizeText(image: UIImage, completion: @escaping([String], [CGRect]) -> Void) {
    var texts: [String] = []
    var positions: [CGRect] = []
    
    guard let cgImage = image.cgImage else { return }
    let request = VNRecognizeTextRequest { (request, error) in
      guard let observations = request.results as? [VNRecognizedTextObservation], error == nil else {
        print("Text recognition error: (error?.localizedDescription ?? "Unknown error")")
        return
      }
      for observation in observations {
        guard let topCandidate = observation.topCandidates(1).first else { continue }
        texts.append(topCandidate.string)
        positions.append(observation.boundingBox)
      }
      DispatchQueue.main.async {
        print(texts)
        print(positions)
        completion(texts, positions)
      }
    }
    request.recognitionLevel = .accurate
    
    let handler = VNImageRequestHandler(cgImage: cgImage)
    try? handler.perform([request])
  }
}

#Preview {
  OCR()
}

(test image attached)

2 Answers


  1. Chosen as BEST ANSWER

    It really took some time to solve this; there is a documentation page: https://developer.apple.com/documentation/vision/vnrecognizedtext/3213755-boundingbox

    At first I didn't know how to use it at all, and I've even forgotten where I found the answer.

    But finally I got this o(>v<)o

    The simple example:

    Image: (simple example result screenshot)

    Complete Code:

    import SwiftUI
    import Vision
    
    struct OCR: View {
      @State var image: UIImage? = UIImage(named: "test")
      @State var texts: [[String]] = []
      @State var positions: [[CGRect]] = []
      
      @State var size: CGSize = CGSize()
      
      func VNImageRectForNormalizedRect(rect: CGRect, imageSize: CGSize) -> CGRect {
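        // Note: the rects passed in were already flipped to a top-left origin in
        // recognizeText, so rect.maxY below is the top edge of the box in
        // normalized view coordinates.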
        let width = imageSize.width
        let height = imageSize.height
        
        let x = rect.minX * width
        let y = rect.maxY * height
        let rectWidth = rect.width * width
        let rectHeight = rect.height * height
        
        return CGRect(x: x, y: y, width: rectWidth, height: rectHeight)
      }
      
      var body: some View {
        ZStack {
          if let image = image {
            Image(uiImage: image)
              .resizable()
              .aspectRatio(contentMode: .fit)
              .overlay {
                GeometryReader { geo in
                  Color.clear
                    .border(.blue)
                    .onAppear {
                      size = geo.size
                    }
                }
              }
              .overlay(Canvas { context, size in
                for linePositions in positions {
                  for position in linePositions {
                    let normalizedRect = VNImageRectForNormalizedRect(rect: position, imageSize: size)
                    context.stroke(Path(normalizedRect), with: .color(.red), lineWidth: 1)
                  }
                }
              })
              .onAppear {
                recognizeText(image: image) { t, p in
                  texts = t
                  positions = p
                }
              }
          } else {
            Text("Their is no picture")
          }
        }
      }
    }
    
    extension OCR {
      func recognizeText(image: UIImage, completion: @escaping([[String]], [[CGRect]]) -> Void) {
        var texts: [[String]] = []
        var positions: [[CGRect]] = []
        
        guard let cgImage = image.cgImage else { return }
        let request = VNRecognizeTextRequest { (request, error) in
          guard let observations = request.results as? [VNRecognizedTextObservation], error == nil else {
            print("Text recognition error: (error?.localizedDescription ?? "Unknown error")")
            return
          }
          for observation in observations {
            guard let topCandidate = observation.topCandidates(1).first else { continue }
            
            
            var lineWords: [String] = []
            var lineRects: [CGRect] = []
            
            var lineBoundingBoxs: [CGRect] = []
            
            for (index, character) in topCandidate.string.enumerated() {
              let startIndex = topCandidate.string.index(topCandidate.string.startIndex, offsetBy: index)
              let endIndex = topCandidate.string.index(startIndex, offsetBy: 1)
              
              let range = startIndex..<endIndex
              
              if let wordBox = try? topCandidate.boundingBox(for: range) {
                let boundingBox = wordBox.boundingBox
                if character != " " {
                  if boundingBox != lineBoundingBoxs.last {
                    lineWords.append("")
                    lineBoundingBoxs.append(boundingBox)
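                    // Convert from Vision's bottom-left-origin space; together with
                    // VNImageRectForNormalizedRect above (which uses rect.maxY as the
                    // top edge), this yields a rect suitable for the Canvas overlay.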
                    let rect = CGRect(x: boundingBox.minX, y: 1 - boundingBox.maxY - boundingBox.height, width: boundingBox.width, height: boundingBox.height)
                    lineRects.append(rect)
                  }
                  lineWords[lineWords.count - 1] += String(character)
                }
              }
            }
            
            // One entry per recognized line (observation).
            texts.append(lineWords)
            positions.append(lineRects)
          }
          
          DispatchQueue.main.async {
            print(texts)
            print(positions)
            completion(texts, positions)
          }
        }
        request.recognitionLevel = .accurate
        
        let handler = VNImageRequestHandler(cgImage: cgImage)
        try? handler.perform([request])
      }
    }
    
    #Preview {
      OCR()
    }
    

    The specific idea is to traverse each character in the sentence and find its corresponding position; characters that share the same position belong to the same word.

    for observation in observations {
      guard let topCandidate = observation.topCandidates(1).first else { continue }
      
      
      var lineWords: [String] = []
      var lineRects: [CGRect] = []
      
      var lineBoundingBoxs: [CGRect] = []
      for (index, character) in topCandidate.string.enumerated() {
        let startIndex = topCandidate.string.index(topCandidate.string.startIndex, offsetBy: index)
        let endIndex = topCandidate.string.index(startIndex, offsetBy: 1)
        
        let range = startIndex..<endIndex
        
        if let wordBox = try? topCandidate.boundingBox(for: range) {
          let boundingBox = wordBox.boundingBox
          if character != " " {
            if boundingBox != lineBoundingBoxs.last {
              lineWords.append("")
              lineBoundingBoxs.append(boundingBox)
              let rect = CGRect(x: boundingBox.minX, y: 1 - boundingBox.maxY - boundingBox.height, width: boundingBox.width, height: boundingBox.height)
              lineRects.append(rect)
            }
            lineWords[lineWords.count - 1] += String(character)
          }
        }
      }
    }
    

    Explanation:

    For example, suppose the following sentence is an observation:

    • The string info: "I am a junior programmer"
    • The BoundingBox info: (bounding-box screenshot attached)

    Iterating through each character, you might expect to get the character-level bounding box, e.g. of the "j" in "junior", but it actually returns the whole word-level bounding box, i.e. the entire green bounding box.

    I mean that whichever single-character range of the word "junior" you pass in (["j", "u", "n", "i", "o", "r"]), it will return [GreenBB, GreenBB, GreenBB, GreenBB, GreenBB, GreenBB] (where BB is BoundingBox).

    Another thing to mention: if the input is " " (a space), the returned BB has effectively zero size (if I remember correctly, something like CGRect(x: 0, y: 0, width: 1, height: 0)), so we can simply ignore the " " characters.

    Since the same BB means the same word, we can group characters by BB and get each word's string together with its corresponding BB.


    Of course, some people will ask: why not split the string on spaces first, and then look up each word's BB directly through its Range? Isn't that much better and faster than traversing the entire string? I actually designed it that way at first, but the Range of a word obtained after splitting does not fully correspond to a range in the original recognized string, so boundingBox(for:) cannot resolve it, and many words end up without a box.
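
    For what it's worth, one way around that mismatch is to ask the candidate string itself for its word ranges, rather than re-deriving them from split substrings, e.g. with enumerateSubstrings(options: .byWords). This is only a sketch of that idea, not the code I used above; wordBoxes is just a made-up helper name:

    import Foundation
    import Vision
    
    // Sketch only: word-level boxes by asking the candidate string for its own
    // word ranges. Because the ranges come from the same String instance,
    // boundingBox(for:) can resolve them.
    func wordBoxes(for observation: VNRecognizedTextObservation) -> [(String, CGRect)] {
      guard let candidate = observation.topCandidates(1).first else { return [] }
      let text = candidate.string
      var results: [(String, CGRect)] = []
      
      text.enumerateSubstrings(in: text.startIndex..<text.endIndex, options: .byWords) { word, range, _, _ in
        guard let word = word,
              let box = try? candidate.boundingBox(for: range) else { return }
        // box.boundingBox is still in Vision's normalized, bottom-left-origin space.
        results.append((word, box.boundingBox))
      }
      return results
    }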



    With some extra processing, such as fitting a linear regression over the per-word BBs, you can get more accurate, non-rectangular text boxes; this is the result on a precisely processed photo: (see the "precisely processed photos" screenshot)


  2. To achieve word-level bounding boxes using the Vision framework’s VNRecognizeTextRequest in SwiftUI, you need to make some adjustments to your code. Currently, your code extracts text lines and their bounding boxes. Here’s how you can modify it to obtain approximate word-level bounding boxes by splitting each line's box evenly by word count:

    import SwiftUI
    import Vision
    
    struct OCR: View {
        @State var image: UIImage? = UIImage(named: "test")
        @State var wordRects: [CGRect] = []
        
        func VNImageRectForNormalizedRect(rect: CGRect, imageSize: CGSize) -> CGRect {
            let width = imageSize.width
            let height = imageSize.height
            
            let x = rect.minX * width
            let y = (1 - rect.maxY) * height
            let rectWidth = rect.width * width
            let rectHeight = rect.height * height
            
            return CGRect(x: x, y: y, width: rectWidth, height: rectHeight)
        }
        
        var body: some View {
            ZStack {
                if let image = image {
                    Image(uiImage: image)
                        .resizable()
                        .aspectRatio(contentMode: .fit)
                        .overlay(Canvas { context, size in
                            for rect in wordRects {
                                // wordRects are normalized (0...1); convert using the displayed size.
                                let normalizedRect = VNImageRectForNormalizedRect(rect: rect, imageSize: size)
                                context.stroke(Path(normalizedRect), with: .color(.red), lineWidth: 1)
                            }
                        })
                        .onAppear {
                            recognizeText(image: image)
                        }
                } else {
                    Text("There is no picture")
                }
            }
        }
        
        func recognizeText(image: UIImage) {
            guard let cgImage = image.cgImage else { return }
            
            let request = VNRecognizeTextRequest { (request, error) in
                guard let observations = request.results as? [VNRecognizedTextObservation], error == nil else {
                    print("Text recognition error: (error?.localizedDescription ?? "Unknown error")")
                    return
                }
                
                var wordRects: [CGRect] = []
                
                for observation in observations {
                    guard let candidate = observation.topCandidates(1).first else { continue }
                    
                    // observation.boundingBox is normalized (0...1) with a bottom-left origin.
                    let lineBox = observation.boundingBox
                    
                    // Approximate word boxes by splitting the line's box into equal-width
                    // slices, one per space-separated word. This is only an estimate; it
                    // ignores the actual width of each word.
                    let words = candidate.string.components(separatedBy: " ")
                    let wordWidth = lineBox.width / CGFloat(words.count)
                    
                    for index in words.indices {
                        let x = lineBox.minX + CGFloat(index) * wordWidth
                        let wordRect = CGRect(x: x, y: lineBox.minY, width: wordWidth, height: lineBox.height)
                        wordRects.append(wordRect)
                    }
                }
                
                DispatchQueue.main.async {
                    self.wordRects = wordRects
                }
            }
            
            request.recognitionLevel = .accurate
            
            let handler = VNImageRequestHandler(cgImage: cgImage)
            try? handler.perform([request])
        }
    }
    
    struct OCR_Previews: PreviewProvider {
        static var previews: some View {
            OCR()
        }
    }
    