Part 10: OCR

OCR items in a case.

In this example, you will ingest the Nuix logo below into your case and perform Optical Character Recognition (OCR) on it. Right-click the image below in your browser and save it to disk as nuix.png.

Prerequisites

  1. Install the OCR plugin from https://download.nuix.com/releases/addons.
  2. Verify that the OCR plugin is installed by calling the resources/ocr endpoint.
curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v1/resources/ocr' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
    "success": true
}

File Ingestion of Image

Ingest your image with text.

curl --location --request POST 'http://localhost:8080/nuix-restful-service/svc/v1/cases/43b070164ce8453ca30ed9e2dfcce67b/evidence/file' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d' \
--header 'Content-Type: application/json' \
--header 'Accept: application/json' \
--data-raw '{
   "processorSettings": {
   	"storeBinary": true
   },
   "target": {
    "path": "/Images/nuix.png"
  }
}
'
{
    "functionKey": "f599c065-523e-4dd5-8ab1-82cad97afe8b",
    "location": "http://localhost:8080/nuix-restful-service/svc/v1/asyncFunctions/f599c065-523e-4dd5-8ab1-82cad97afe8b"
}

File Ingestion Status

Use the functionKey returned by the ingestion endpoint above to poll the asyncFunctions endpoint for the status of the ingestion.

curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v1/asyncFunctions/f599c065-523e-4dd5-8ab1-82cad97afe8b' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
    "done": true,
    "cancelled": false,
    "result": true,
    "token": "a3b10cf2-1c75-4507-86dd-865eb56555b7",
    "functionKey": "f599c065-523e-4dd5-8ab1-82cad97afe8b",
    "progress": 7011,
    "total": 0,
    "percentComplete": null,
    "updatedOn": 1613506664487,
    "status": null,
    "statusId": null,
    "requestTime": 1613506653553,
    "startTime": 1613506653553,
    "finishTime": 1613506666353,
    "caseId": "43b070164ce8453ca30ed9e2dfcce67b",
    "caseName": "HelloWorld",
    "hasSuccessfullyCompleted": true,
    "friendlyName": "Evidence Ingestion Function",
    "caseLocation": "/Cases/HelloWorld",
    "requestor": "nuixadmin",
    "action": "AsyncBulkIngestionFunction",
    "options": {
        "reloadQuery": null,
        "processorSettings": {
            "processText": null,
            "processLooseFileContents": null,
            "processForensicImages": null,
            "analysisLanguage": null,
            "stopWords": null,
            "stemming": null,
            "enableExactQueries": null,
            "extractNamedEntities": null,
            "extractNamedEntitiesFromText": null,
            "extractNamedEntitiesFromProperties": null,
            "extractNamedEntitiesFromTextStripped": null,
            "extractShingles": null,
            "processTextSummaries": null,
            "calculateSSDeepFuzzyHash": null,
            "detectFaces": null,
            "extractFromSlackSpace": null,
            "carveFileSystemUnallocatedSpace": null,
            "carveUnidentifiedData": null,
            "carvingBlockSize": null,
            "recoverDeletedFiles": null,
            "extractEndOfFileSlackSpace": null,
            "smartProcessRegistry": null,
            "identifyPhysicalFiles": null,
            "createThumbnails": null,
            "skinToneAnalysis": null,
            "calculateAuditedSize": null,
            "storeBinary": true,
            "maxStoredBinarySize": null,
            "maxDigestSize": null,
            "digests": [],
            "addBccToEmailDigests": null,
            "addCommunicationDateToEmailDigests": null,
            "reuseEvidenceStores": null,
            "processFamilyFields": null,
            "hideEmbeddedImmaterialData": null,
            "reportProcessingStatus": null,
            "workerItemCallback": null,
            "workerItemCallbacks": null
        },
        "evidence": [
            {
                "guid": null,
                "name": null,
                "customMetadata": null,
                "encoding": null,
                "custodian": null,
                "timeZone": null,
                "description": null,
                "locale": null,
                "files": [
                    {
                        "path": "/Images/nuix.png"
                    }
                ],
                "exchangeMailboxes": null,
                "s3Buckets": null,
                "sqlServers": null,
                "enterpriseVaults": null,
                "sharepointSites": null,
                "mailStores": null,
                "loadFiles": null,
                "centeraClusters": null,
                "splitFiles": null,
                "dropboxes": null,
                "sshServers": null
            }
        ],
        "localWorkerCount": 1,
        "repositories": [],
        "parallelProcessingSettings": {
            "workerCount": null,
            "workerMemory": null,
            "workerTemp": null,
            "brokerMemory": null,
            "workerBrokerAddress": null,
            "useRemoteWorkers": false,
            "embedBroker": true
        },
        "rescanEvidenceRepositories": false,
        "loadProcessingJob": {
            "casePath": "/Cases/HelloWorld",
            "jobGuid": "55480a21-720c-4f3e-baa6-c7c5b53b28b0",
            "processingMode": "Load",
            "startDate": 1613506654064,
            "workerCount": 1,
            "finished": true,
            "paused": false,
            "masterAddress": "1.1.1.1",
            "bytesProcessed": 7011,
            "itemsProcessed": 1,
            "jobSizeTotalBytes": 0
        }
    },
    "participatingInCaseFunctionQueue": true,
    "processedBy": "nuix-restful-server-1",
    "errorMsg": null
}

Item Verification

Verify that you have the item (nuix.png) in your case by doing a search. You can also use the search to retrieve the guid of the item for OCR.

curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v2/cases/43b070164ce8453ca30ed9e2dfcce67b/search?query=nuix.png&metadataProfile=Default&numberOfRecordsRequested=1' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
    "request": {
        "caseId": "43b070164ce8453ca30ed9e2dfcce67b",
        "query": "nuix.png",
        "sortField": null,
        "sortOrder": null,
        "startIndex": 0,
        "numberOfRecordsRequested": 1,
        "deduplicate": null,
        "metadataProfile": "Default",
        "fieldList": null,
        "customMetadataList": null,
        "propertyList": null,
        "itemParameterizedFields": null,
        "showAvailableThumbnails": false,
        "useCache": false,
        "forceCacheDelete": false,
        "searchRetry": 0,
        "relationType": null,
        "entities": [],
        "s": 0,
        "p": 1,
        "customMetadataField": null,
        "field": null,
        "property": null
    },
    "startedOn": 1613507186542,
    "completedOn": 1613507186598,
    "elapsedTimeForSearch": 50,
    "elapsedTimeForSort": 0,
    "elapsedTimeForMarshal": 1,
    "elapsedTimeForDeduplicate": 0,
    "elapsedTotal": 56,
    "metadataItems": [
        "Name",
        "File Type",
        "Path Name"
    ],
    "localizedMetadataItems": [
        "Name",
        "File Type",
        "Path Name"
    ],
    "metadataItemDetails": [
        {
            "name": "Name",
            "localisedName": "Name",
            "type": "String"
        },
        {
            "name": "File Type",
            "localisedName": "File Type",
            "type": "String"
        },
        {
            "name": "Path Name",
            "localisedName": "Path Name",
            "type": "String"
        }
    ],
    "resultList": [
        {
            "File Type": "Portable Network Graphic",
            "Name": "nuix.png",
            "Path Name": "/e1746668-8eca-48b2-a8bd-14f17c59c1e1",
            "guid": "519ca60c-a397-4d1b-a5aa-547287a8ad1a"
        }
    ],
    "count": 1,
    "deduplicatedCount": 1
}

OCR

You can now OCR the item using the guid returned from your search query.

519ca60c-a397-4d1b-a5aa-547287a8ad1a

curl --location --request PUT 'http://localhost:8080/nuix-restful-service/svc/v1/cases/43b070164ce8453ca30ed9e2dfcce67b/items/ocr' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d' \
--header 'Content-Type: application/json' \
--header 'Accept: application/json' \
--data-raw '{
  "parallelProcessingSettings": {
    "embedBroker": true,
    "workerCount": 4,
    "workerMemory": 2048
  },
  "query": "guid:519ca60c-a397-4d1b-a5aa-547287a8ad1a"
}'
{
    "functionKey": "f534cd08-c374-4c7a-acf4-32ec613adbff",
    "location": "http://localhost:8080/nuix-restful-service/svc/v1/asyncFunctions/f534cd08-c374-4c7a-acf4-32ec613adbff"
}

OCR Status

Use the functionKey returned by the OCR endpoint above to poll the asyncFunctions endpoint for the status of the OCR job.

curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v1/asyncFunctions/f534cd08-c374-4c7a-acf4-32ec613adbff' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
    "done": true,
    "cancelled": false,
    "result": {
        "total": 1,
        "failure": 0
    },
    "token": "a3b10cf2-1c75-4507-86dd-865eb56555b7",
    "functionKey": "f534cd08-c374-4c7a-acf4-32ec613adbff",
    "progress": 1,
    "total": 1,
    "percentComplete": 100.0000,
    "updatedOn": 1613507447309,
    "status": null,
    "statusId": null,
    "requestTime": 1613507437697,
    "startTime": 1613507437698,
    "finishTime": 1613507449524,
    "caseId": "43b070164ce8453ca30ed9e2dfcce67b",
    "caseName": "HelloWorld",
    "hasSuccessfullyCompleted": true,
    "friendlyName": "OCR Function",
    "caseLocation": "/Cases/HelloWorld",
    "requestor": "username",
    "action": "AsyncOcrFunction",
    "options": {
        "ocrOptions": {
            "regeneratePdfs": null,
            "updatePdf": null,
            "updateText": null,
            "textModification": null,
            "quality": null,
            "rotation": null,
            "deskew": null,
            "clearOcrCache": null,
            "outputDirectory": null,
            "languages": [
                "ENGLISH"
            ],
            "timeout": null,
            "updateDuplicates": null,
            "ocrProfileName": null
        },
        "localWorkerCount": 1,
        "parallelProcessingSettings": {
            "workerCount": 4,
            "workerMemory": 2048,
            "workerTemp": null,
            "brokerMemory": null,
            "workerBrokerAddress": null,
            "useRemoteWorkers": false,
            "embedBroker": true
        },
        "query": "guid:519ca60c-a397-4d1b-a5aa-547287a8ad1a",
        "ocrProfile": null,
        "ocrImagingSettings": null,
        "exportProcessingJob": {
            "casePath": "/Cases/HelloWorld",
            "jobGuid": "145e16b9-7800-4240-b108-a2e8ecc58724",
            "processingMode": "Export",
            "startDate": 1613507437889,
            "workerCount": 9,
            "finished": true,
            "paused": false,
            "masterAddress": "1.1.1.1",
            "currentStage": "TEXT_REPLACEMENT",
            "currentStageDuration": "0 seconds",
            "currentStageExportedItemsCount": 1,
            "failedItemCount": 0,
            "totalItemCount": 1
        },
        "imagingProfile": null,
        "tags": null
    },
    "participatingInCaseFunctionQueue": true,
    "processedBy": "nuix-restful-server-1",
    "errorMsg": null
}

Item Text

Finally, you can retrieve the item text of the image using the itemText endpoint.

curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v1/cases/43b070164ce8453ca30ed9e2dfcce67b/items/519ca60c-a397-4d1b-a5aa-547287a8ad1a/itemText' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
    "text": "nuix",
    "binaryAvailable": true,
    "htmlEscape": false,
    "totalTextLength": 4,
    "blank": false
}

The text field in the response is "nuix", which confirms that OCR was performed successfully on this image.