Drag-and-drop file upload with document checklist
Since we've already collected comprehensive data through CM02 (financial review via TrueLayer + Xero) and CM04 (identity verification via Onfido), additional documents are supplementary only.
Purpose: Speed up approval or provide additional context; these documents are not required for a decision.
// After successful upload
<!-- Preview card shown for a file once its upload has succeeded. -->
<div class="file-preview">
  <div class="file-info">
    <!-- Decorative icon: hidden from assistive technology -->
    <div class="file-icon" aria-hidden="true">📄</div>
    <div class="file-details">
      <div class="file-name">business-plan.pdf</div>
      <div class="file-meta">2.4 MB • Uploaded just now</div>
    </div>
  </div>
  <div class="file-actions">
    <!-- type="button" keeps these from submitting an enclosing form -->
    <button type="button" onclick="viewFile()">View</button>
    <!-- The visible label is only a symbol, so give the button an accessible name -->
    <button type="button" onclick="removeFile()" aria-label="Remove file">×</button>
  </div>
</div>
Multipart file upload and document management endpoints
POST /api/v1/documents/upload Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9... Content-Type: multipart/form-data; boundary=----WebKitFormBoundary ------WebKitFormBoundary Content-Disposition: form-data; name="file"; filename="business-plan.pdf" Content-Type: application/pdf [Binary file data] ------WebKitFormBoundary Content-Disposition: form-data; name="application_id" APP-2025-001234 ------WebKitFormBoundary Content-Disposition: form-data; name="document_type" business_plan ------WebKitFormBoundary Content-Disposition: form-data; name="category" supporting_documents ------WebKitFormBoundary--
{
"request_id": "req_cm08_20251123_150000",
"status": "success",
"document": {
"id": "DOC-2025-001234-001",
"filename": "business-plan.pdf",
"file_size": 2457600, // 2.4 MB in bytes
"file_type": "application/pdf",
"document_type": "business_plan",
"category": "supporting_documents",
"uploaded_at": "2025-11-23T15:00:00Z",
"s3_url": "s3://aina-docs/APP-2025-001234/DOC-001.pdf",
"view_url": "/api/v1/documents/DOC-2025-001234-001/view"
},
"classification": {
"status": "processing",
"message": "Document is being analyzed..."
},
"processing_time_ms": 1847
}
GET /api/v1/documents/list?application_id=APP-2025-001234 Response: { "application_id": "APP-2025-001234", "total_documents": 2, "documents": [ { "id": "DOC-2025-001234-001", "filename": "business-plan.pdf", "document_type": "business_plan", "file_size": 2457600, "uploaded_at": "2025-11-23T15:00:00Z", "classification": "business_plan", "confidence": 0.95 }, { "id": "DOC-2025-001234-002", "filename": "trading-license.jpg", "document_type": "business_license", "file_size": 1534720, "uploaded_at": "2025-11-23T15:02:15Z", "classification": "license", "confidence": 0.89 } ] }
AI document classification and OCR extraction
/**
 * Classify an uploaded document from the OCR text of its first page and
 * store the result on the document record.
 *
 * @param {string} documentId - ID passed to Document.update.
 * @param {string} filePath - Location of the file. Its basename is sent to the
 *   model as extra context, and it is passed to extractTextFromPDF as the S3
 *   object key.
 * @param {string} [s3Bucket='aina-docs'] - Bucket passed to extractTextFromPDF.
 * @returns {Promise<object>} Whatever mlModel.classify resolves to; the fields
 *   `category` and `confidence` are read from it.
 */
async function classifyDocument(documentId, filePath, s3Bucket = 'aina-docs') {
  // Extract text from first page using OCR.
  // The page number is passed positionally: writing `page=1` at a call site is
  // an assignment to an undeclared variable, not a named argument.
  // NOTE(review): assumes filePath is the S3 object key (for example
  // "APP-2025-001234/DOC-001.pdf") — confirm against the upload handler.
  const firstPageText = await extractTextFromPDF(s3Bucket, filePath, 1);

  // Get filename for additional context
  const filename = path.basename(filePath);

  // Call ML model API
  const classification = await mlModel.classify({
    text: firstPageText,
    filename,
    max_length: 512, // Token limit for DistilBERT
  });

  // Store classification result
  await Document.update(documentId, {
    classification: classification.category,
    classification_confidence: classification.confidence,
    classified_at: new Date(),
  });

  return classification;
}

// Example output:
// { category: "business_plan", confidence: 0.95 }
/**
 * Run Textract's analyzeDocument on an object stored in S3 and return the
 * text of the LINE blocks found on one page.
 *
 * @param {string} s3Bucket - Bucket containing the document.
 * @param {string} s3Key - Object key of the document.
 * @param {number} [page=1] - Page number matched against each block's Page.
 * @returns {Promise<string>} LINE texts joined with single spaces; an empty
 *   string when no matching blocks are returned.
 */
async function extractTextFromPDF(s3Bucket, s3Key, page = 1) {
  const params = {
    Document: {
      S3Object: {
        Bucket: s3Bucket,
        Name: s3Key,
      },
    },
    FeatureTypes: ['TABLES', 'FORMS'],
  };

  const response = await textract.analyzeDocument(params).promise();

  // A response without Blocks yields empty text instead of a TypeError.
  const blocks = Array.isArray(response.Blocks) ? response.Blocks : [];

  // Extract text blocks from specified page.
  // A block without a Page value is treated as page 1, so single-page results
  // that omit the field are not filtered out entirely.
  const pageBlocks = blocks.filter((block) => {
    const blockPage = block.Page == null ? 1 : block.Page;
    return block.BlockType === 'LINE' && blockPage === page;
  });

  return pageBlocks.map((block) => block.Text).join(' ');
}
/**
 * Validate an uploaded file before it is stored.
 *
 * Checks run in order: size limit, MIME-type whitelist, virus scan, then a
 * duplicate lookup by SHA-256 hash.
 *
 * @param {object} file - Upload descriptor; `size`, `mimetype`, `path` and
 *   `buffer` are read from it.
 * @returns {Promise<{valid: boolean, warning?: string}>} `{ valid: true }`
 *   when every check passes; a duplicate is still valid but carries a
 *   `warning` message.
 * @throws {ValidationError} When the file exceeds 10MB or its MIME type is
 *   not whitelisted.
 * @throws {SecurityError} When the virus scan reports an infection.
 */
async function validateDocument(file) {
  // 1. Check file size
  const maxSizeBytes = 10 * 1024 * 1024;
  if (file.size > maxSizeBytes) {
    throw new ValidationError("File too large (max 10MB)");
  }

  // 2. Check file type
  // NOTE(review): mimetype is presumably the value declared on the upload,
  // not sniffed from the content — confirm whether that is sufficient.
  const allowedTypes = ['application/pdf', 'image/jpeg', 'image/png'];
  if (!allowedTypes.includes(file.mimetype)) {
    throw new ValidationError("Invalid file type");
  }

  // 3. Virus scan
  const scanResult = await clamav.scanFile(file.path);
  if (scanResult.isInfected) {
    throw new SecurityError("File contains malware");
  }

  // 4. Check duplicate
  const fileHash = calculateSHA256(file.buffer);
  const existingDoc = await Document.findByHash(fileHash);
  if (existingDoc) {
    // A duplicate is a warning, not a failure: keep `valid` in the result so
    // callers that test `result.valid` do not reject the file.
    return { valid: true, warning: "This file was already uploaded" };
  }

  return { valid: true };
}
S3 storage, async processing, and notifications
/**
 * Store an uploaded file in the 'aina-docs' bucket with AES256 server-side
 * encryption.
 *
 * The object key is `<applicationId>/<documentId><ext>`, where `<ext>` is the
 * extension of the file's original name.
 *
 * @param {object} file - Upload descriptor; `originalname`, `buffer`,
 *   `mimetype` and `userId` are read from it.
 * @param {string} applicationId - Used as the key prefix and in the metadata.
 * @param {string} documentId - Used as the object name within the prefix.
 * @returns {Promise<{s3_url: string, s3_key: string, etag: string}>} The
 *   upload result's Location and ETag, plus the generated key.
 */
async function uploadToS3(file, applicationId, documentId) {
  const extension = path.extname(file.originalname);
  const objectKey = `${applicationId}/${documentId}${extension}`;

  const { Location, ETag } = await s3
    .upload({
      Bucket: 'aina-docs',
      Key: objectKey,
      Body: file.buffer,
      ContentType: file.mimetype,
      ServerSideEncryption: 'AES256',
      Metadata: {
        application_id: applicationId,
        uploaded_by: file.userId,
        uploaded_at: new Date().toISOString(),
      },
    })
    .promise();

  return {
    s3_url: Location,
    s3_key: objectKey,
    etag: ETag,
  };
}
Speed: Don't make the customer wait 2-3 seconds for OCR + classification
Scalability: Process multiple documents in parallel
Reliability: SQS ensures classification happens even if a worker is temporarily down
{
"event": "document_classified",
"document_id": "DOC-2025-001234-001",
"classification": {
"category": "business_plan",
"confidence": 0.95,
"processing_time_ms": 487
},
"timestamp": "2025-11-23T15:00:03Z"
}
AWS services for storage, OCR, and virus scanning
// Example: analyze a document stored in S3 with Textract and collect the
// text of every LINE block into a single string.
const textract = new AWS.Textract();

const params = {
  Document: {
    S3Object: {
      Bucket: 'aina-docs',
      Name: 'APP-2025-001234/DOC-001.pdf',
    },
  },
  FeatureTypes: ['TABLES', 'FORMS'],
};

const result = await textract.analyzeDocument(params).promise();

// Keep only the LINE blocks, in the order Textract returned them
const textBlocks = [];
for (const block of result.Blocks) {
  if (block.BlockType === 'LINE') {
    textBlocks.push(block.Text);
  }
}

const fullText = textBlocks.join(' ');
Document metadata storage and audit logging
| Column | Type | Description | Example |
|---|---|---|---|
| id | VARCHAR(50) | Primary key | DOC-2025-001234-001 |
| application_id | VARCHAR(50) | Foreign key to applications | APP-2025-001234 |
| filename | VARCHAR(255) | Original filename | business-plan.pdf |
| file_size | INTEGER | Size in bytes | 2457600 |
| file_type | VARCHAR(100) | MIME type | application/pdf |
| document_type | VARCHAR(50) | User-selected category | business_plan |
| s3_bucket | VARCHAR(100) | S3 bucket name | aina-docs |
| s3_key | VARCHAR(500) | S3 object key | APP-2025-001234/DOC-001.pdf |
| s3_etag | VARCHAR(100) | S3 ETag for integrity | "a8b3c5d7..." |
| file_hash | VARCHAR(64) | SHA-256 hash | 7f4b8c... |
| classification | VARCHAR(50) | AI-determined category | business_plan |
| classification_confidence | DECIMAL(4,3) | Confidence score | 0.950 |
| ocr_text | TEXT | Extracted text (first page) | "Business Plan 2025..." |
| virus_scan_status | VARCHAR(20) | Clean/Infected/Pending | clean |
| uploaded_by | VARCHAR(50) | User ID | CUST-001234 |
| uploaded_at | TIMESTAMP | Upload timestamp | 2025-11-23 15:00:00 |
| classified_at | TIMESTAMP | When AI classified | 2025-11-23 15:00:03 |
| deleted_at | TIMESTAMP | Soft delete timestamp | NULL |
-- Metadata for each uploaded document: where the file is stored in S3, its
-- integrity hashes, the classification result, and upload bookkeeping.
CREATE TABLE documents (
    id VARCHAR(50) PRIMARY KEY,
    application_id VARCHAR(50) NOT NULL REFERENCES applications(id),
    filename VARCHAR(255) NOT NULL,
    file_size INTEGER NOT NULL,
    file_type VARCHAR(100) NOT NULL,
    document_type VARCHAR(50),
    s3_bucket VARCHAR(100) NOT NULL,
    s3_key VARCHAR(500) NOT NULL,
    s3_etag VARCHAR(100),
    file_hash VARCHAR(64) NOT NULL,
    -- Classification columns are nullable: they are filled in by a later UPDATE
    classification VARCHAR(50),
    classification_confidence DECIMAL(4,3),
    ocr_text TEXT,
    virus_scan_status VARCHAR(20) DEFAULT 'pending',
    uploaded_by VARCHAR(50) NOT NULL,
    uploaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    classified_at TIMESTAMP,
    -- Soft delete marker; NULL while the document is live
    deleted_at TIMESTAMP,
    CONSTRAINT check_file_size CHECK (file_size <= 10485760) -- 10MB
);

-- Lookup of all documents belonging to one application
CREATE INDEX idx_documents_app ON documents(application_id);
-- Lookup by content hash
CREATE INDEX idx_documents_hash ON documents(file_hash);
-- Lookup or ordering by upload time
CREATE INDEX idx_documents_uploaded ON documents(uploaded_at);
-- Record a newly uploaded document. The classification columns
-- (classification, classification_confidence, ocr_text, classified_at) are
-- not set by this statement.
INSERT INTO documents (
    id, application_id, filename, file_size, file_type,
    document_type, s3_bucket, s3_key, s3_etag, file_hash,
    virus_scan_status, uploaded_by
) VALUES (
    'DOC-2025-001234-001', 'APP-2025-001234', 'business-plan.pdf', 2457600, 'application/pdf',
    'business_plan', 'aina-docs', 'APP-2025-001234/DOC-001.pdf', 'a8b3c5d7...', '7f4b8c...',
    'clean', 'CUST-001234'
);
-- Store the classification result and the extracted text on an existing
-- document row, stamping the time of classification.
UPDATE documents SET
    classification = 'business_plan',
    classification_confidence = 0.950,
    ocr_text = 'Business Plan 2025 - Smith''s Artisan Café...',
    classified_at = CURRENT_TIMESTAMP
WHERE id = 'DOC-2025-001234-001';
{
"event_type": "document_uploaded",
"application_id": "APP-2025-001234",
"document_id": "DOC-2025-001234-001",
"customer_id": "CUST-001234",
"timestamp": "2025-11-23T15:00:00Z",
"document_details": {
"filename": "business-plan.pdf",
"file_size": 2457600,
"file_type": "application/pdf",
"document_type": "business_plan"
},
"classification": {
"category": "business_plan",
"confidence": 0.950,
"classified_at": "2025-11-23T15:00:03Z"
},
"user_agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0)",
"ip_address": "86.134.x.x"
}
File exceeds 10MB limit
File type not in whitelist (e.g., .exe, .zip)
ClamAV detects malware in file
Network error or S3 service unavailable
File hash matches existing document
ML model error or OCR extraction failed
ML model confidence <60%
PDF cannot be opened or read
Application has 10+ documents already
JWT token expired during upload