Skip to content
Go back

Creating AI Embeddings with OpenAI for Semantic Search

Creating AI Embeddings with OpenAI for Semantic Search

Introduction

OpenAI embeddings convert text into high-dimensional vectors that capture semantic meaning. This guide implements semantic search using embeddings for intelligent document retrieval.

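Prerequisites

- Node.js with npm
- A Next.js project using the App Router (the examples use app/ routes and the @/ import alias)
- A PostgreSQL database on a server where the pgvector extension is available
- An OpenAI API key in the OPENAI_API_KEY environment variable and the database connection string in DATABASE_URL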

Step 1: Install Dependencies

# Runtime dependencies
npm install openai pg
# Type definitions are only needed at compile time
npm install --save-dev @types/pg

Step 2: Setup Database with Vector Extension

Enable the pgvector extension in your PostgreSQL database (pgvector itself must already be installed on the database server):

-- Connect to your database and run (every statement is safe to re-run):
CREATE EXTENSION IF NOT EXISTS vector;

-- Create table for storing documents and embeddings
CREATE TABLE IF NOT EXISTS documents (
  id SERIAL PRIMARY KEY,
  title TEXT NOT NULL,
  content TEXT NOT NULL,
  embedding VECTOR(1536), -- OpenAI ada-002 produces 1536-dimensional vectors
  created_at TIMESTAMP DEFAULT NOW()
);

-- Create index for vector similarity search (cosine distance, matching the <=> operator).
-- The index is named so that re-running this script does not create duplicates.
-- Note: IVFFlat builds its lists from the rows present at creation time, so for good
-- recall (re)build this index once the table contains representative data.
CREATE INDEX IF NOT EXISTS documents_embedding_idx
  ON documents USING ivfflat (embedding vector_cosine_ops)
  WITH (lists = 100);

Step 3: Create Embedding Utilities

Create lib/embeddings.ts:

import OpenAI from 'openai';

// Shared OpenAI client for this module; the API key is read from the
// OPENAI_API_KEY environment variable.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY! });

/**
 * Embeds a single piece of text with the `text-embedding-ada-002` model.
 *
 * @param text - The text to embed.
 * @returns The embedding vector of the first item in the API response.
 * @throws Error with the message "Failed to create embedding" when the request
 *   or the response handling fails; the underlying error is logged first.
 */
export async function createEmbedding(text: string): Promise<number[]> {
  try {
    const { data } = await openai.embeddings.create({
      input: text,
      model: 'text-embedding-ada-002',
    });
    // A single input yields a single item; take its vector.
    const [{ embedding }] = data;

    return embedding;
  } catch (error) {
    console.error('Error creating embedding:', error);
    throw new Error('Failed to create embedding');
  }
}

/**
 * Embeds several texts in one request to the `text-embedding-ada-002` model.
 *
 * @param texts - The texts to embed. An empty array is allowed.
 * @returns One embedding per input text, in the same order as `texts`.
 *   Returns an empty array (without calling the API) when `texts` is empty.
 * @throws Error with the message "Failed to create embeddings" when the request
 *   or the response handling fails; the underlying error is logged first.
 */
export async function createEmbeddings(texts: string[]): Promise<number[][]> {
  // The API rejects an empty input list, so short-circuit instead of failing.
  if (texts.length === 0) {
    return [];
  }

  try {
    const response = await openai.embeddings.create({
      model: 'text-embedding-ada-002',
      input: texts,
    });

    // Each item carries the index of the input it belongs to; order by it so
    // the result lines up with `texts` regardless of response ordering.
    return [...response.data]
      .sort((left, right) => left.index - right.index)
      .map(item => item.embedding);
  } catch (error) {
    console.error('Error creating embeddings:', error);
    throw new Error('Failed to create embeddings');
  }
}

/**
 * Calculates the cosine similarity between two vectors.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same length as `a`.
 * @returns The cosine of the angle between `a` and `b`, in the range [-1, 1].
 *   Returns 0 when either vector has zero magnitude (including empty vectors),
 *   where the similarity is otherwise undefined.
 * @throws Error when the vectors have different lengths.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error(`Vector length mismatch: ${a.length} vs ${b.length}`);
  }

  let dotProduct = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;

  // Single pass: accumulate the dot product and both squared norms together.
  for (let i = 0; i < a.length; i++) {
    const x = a[i];
    const y = b[i];
    dotProduct += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
  }

  const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);

  // Avoid 0 / 0 = NaN for zero-magnitude vectors.
  return denominator === 0 ? 0 : dotProduct / denominator;
}

Step 4: Database Operations

Create lib/database.ts:

import { Pool } from 'pg';

// Module-wide PostgreSQL connection pool, configured from DATABASE_URL.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });

/** A row of the `documents` table as returned by the query helpers in this module. */
export interface Document {
  /** Primary key. */
  id: number;
  /** Document title. */
  title: string;
  /** Full document text. */
  content: string;
  /** Creation timestamp. */
  created_at: Date;
  /** Embedding vector; only present when a query selects it. */
  embedding?: number[];
  /** Similarity to the search query; only present on search results. */
  similarity?: number;
}

/**
 * Inserts a document together with its embedding.
 *
 * @param title - Document title.
 * @param content - Full document text.
 * @param embedding - Embedding vector for the content; serialized as a
 *   `[v1,v2,...]` literal for the vector column.
 * @returns The inserted row (id, title, content, created_at). The embedding is
 *   deliberately not returned: without a registered type parser the driver
 *   hands vector columns back as text, which would not match
 *   `Document.embedding`, and the other queries in this module omit it too.
 */
export async function insertDocument(
  title: string,
  content: string,
  embedding: number[]
): Promise<Document> {
  const client = await pool.connect();

  try {
    const result = await client.query(
      `INSERT INTO documents (title, content, embedding)
       VALUES ($1, $2, $3)
       RETURNING id, title, content, created_at`,
      [title, content, `[${embedding.join(',')}]`]
    );

    return result.rows[0];
  } finally {
    // Always hand the client back to the pool, even when the query fails.
    client.release();
  }
}

/**
 * Finds the documents whose embeddings are most similar to a query embedding.
 *
 * Similarity is computed in SQL as `1 - (embedding <=> query)`.
 *
 * @param queryEmbedding - Embedding of the search query.
 * @param limit - Maximum number of rows to return (default 10).
 * @param threshold - Only rows with similarity strictly above this value are
 *   returned (default 0.7).
 * @returns Matching rows with a `similarity` field, closest match first.
 */
export async function searchSimilarDocuments(
  queryEmbedding: number[],
  limit: number = 10,
  threshold: number = 0.7
): Promise<Document[]> {
  // Serialize the vector as a `[v1,v2,...]` literal for the query parameter.
  const vectorLiteral = `[${queryEmbedding.join(',')}]`;
  const params = [vectorLiteral, threshold, limit];

  const client = await pool.connect();
  try {
    const { rows } = await client.query(
      `SELECT 
         id, title, content, created_at,
         1 - (embedding <=> $1) AS similarity
       FROM documents 
       WHERE 1 - (embedding <=> $1) > $2
       ORDER BY embedding <=> $1
       LIMIT $3`,
      params
    );
    return rows;
  } finally {
    // Return the client to the pool whether or not the query succeeded.
    client.release();
  }
}

/**
 * Lists every stored document, newest first.
 *
 * @returns All rows of `documents` (id, title, content, created_at); the
 *   embedding column is not selected.
 */
export async function getAllDocuments(): Promise<Document[]> {
  const listQuery =
    'SELECT id, title, content, created_at FROM documents ORDER BY created_at DESC';

  const client = await pool.connect();
  try {
    const { rows } = await client.query(listQuery);
    return rows;
  } finally {
    // Return the client to the pool whether or not the query succeeded.
    client.release();
  }
}

Step 5: Create API Routes

Create app/api/documents/route.ts:

import { NextRequest, NextResponse } from 'next/server';
import { insertDocument, getAllDocuments } from '@/lib/database';
import { createEmbedding } from '@/lib/embeddings';

/**
 * POST /api/documents: embeds the submitted content and stores the document.
 *
 * Expects a JSON body `{ title: string, content: string }`.
 * Responds with:
 * - 200 `{ id, message }` on success,
 * - 400 when the body is not valid JSON or title/content are missing,
 *   not strings, or blank,
 * - 500 when embedding or storing the document fails.
 */
export async function POST(req: NextRequest) {
  // A malformed body is a client error, so parse it outside the 500 handler.
  let body: unknown;
  try {
    body = await req.json();
  } catch {
    return NextResponse.json(
      { error: 'Request body must be valid JSON' },
      { status: 400 }
    );
  }

  const payload =
    typeof body === 'object' && body !== null
      ? (body as Record<string, unknown>)
      : {};
  const { title, content } = payload;

  // Reject non-string and whitespace-only values before calling the embedding API.
  if (
    typeof title !== 'string' ||
    typeof content !== 'string' ||
    !title.trim() ||
    !content.trim()
  ) {
    return NextResponse.json(
      { error: 'Title and content are required' },
      { status: 400 }
    );
  }

  try {
    // Create embedding for the document content
    const embedding = await createEmbedding(content);

    // Store document with embedding
    const document = await insertDocument(title, content, embedding);

    return NextResponse.json({
      id: document.id,
      message: 'Document added successfully'
    });
  } catch (error) {
    console.error('Error adding document:', error);
    return NextResponse.json(
      { error: 'Failed to add document' },
      { status: 500 }
    );
  }
}

/**
 * GET /api/documents: returns all stored documents as `{ documents }`.
 * Responds with 500 and an error message when the lookup fails.
 */
export async function GET() {
  try {
    const payload = { documents: await getAllDocuments() };
    return NextResponse.json(payload);
  } catch (error) {
    console.error('Error fetching documents:', error);

    const failure = { error: 'Failed to fetch documents' };
    return NextResponse.json(failure, { status: 500 });
  }
}

Create app/api/search/route.ts:

import { NextRequest, NextResponse } from 'next/server';
import { searchSimilarDocuments } from '@/lib/database';
import { createEmbedding } from '@/lib/embeddings';

/**
 * POST /api/search: embeds the query and returns the most similar documents.
 *
 * Expects a JSON body `{ query: string, limit?: number, threshold?: number }`.
 * `limit` defaults to 10 and is capped at 100; `threshold` defaults to 0.7.
 * Responds with:
 * - 200 `{ results, query, count }` on success,
 * - 400 when the body is not valid JSON, the query is missing or blank, or
 *   limit/threshold are not valid numbers,
 * - 500 when embedding the query or searching fails.
 */
export async function POST(req: NextRequest) {
  // Upper bound on rows per request so a client cannot ask for the whole table.
  const maxLimit = 100;

  // A malformed body is a client error, so parse it outside the 500 handler.
  let body: unknown;
  try {
    body = await req.json();
  } catch {
    return NextResponse.json(
      { error: 'Request body must be valid JSON' },
      { status: 400 }
    );
  }

  const payload =
    typeof body === 'object' && body !== null
      ? (body as Record<string, unknown>)
      : {};
  const query = payload.query;
  const rawLimit = payload.limit ?? 10;
  const threshold = payload.threshold ?? 0.7;

  if (typeof query !== 'string' || !query.trim()) {
    return NextResponse.json(
      { error: 'Search query is required' },
      { status: 400 }
    );
  }

  // These values end up as SQL parameters (LIMIT and a similarity bound),
  // so reject anything that is not a sensible number.
  if (
    typeof rawLimit !== 'number' ||
    !Number.isInteger(rawLimit) ||
    rawLimit < 1
  ) {
    return NextResponse.json(
      { error: 'limit must be a positive integer' },
      { status: 400 }
    );
  }

  if (
    typeof threshold !== 'number' ||
    !Number.isFinite(threshold) ||
    threshold < -1 ||
    threshold > 1
  ) {
    return NextResponse.json(
      { error: 'threshold must be a number between -1 and 1' },
      { status: 400 }
    );
  }

  const limit = Math.min(rawLimit, maxLimit);

  try {
    // Create embedding for the search query
    const queryEmbedding = await createEmbedding(query);

    // Search for similar documents
    const results = await searchSimilarDocuments(
      queryEmbedding,
      limit,
      threshold
    );

    return NextResponse.json({
      results,
      query,
      count: results.length
    });
  } catch (error) {
    console.error('Error searching documents:', error);
    return NextResponse.json(
      { error: 'Failed to search documents' },
      { status: 500 }
    );
  }
}

Step 6: Create Search Interface

Create components/SemanticSearch.tsx:

'use client';

import { useCallback, useEffect, useState } from 'react';

/** A stored document as returned by the documents API. */
interface Document {
  id: number;
  title: string;
  content: string;
  /** Creation timestamp as serialized in the JSON response. */
  created_at: string;
}

/** A search hit: a document plus its similarity score to the query. */
interface SearchResult extends Document {
  similarity: number;
}

/**
 * Semantic search UI: lets the user add documents, run a semantic search via
 * POST /api/search, and browse all stored documents from GET /api/documents.
 *
 * Document content is user-supplied, so it is always rendered as text nodes
 * (never as raw HTML); highlighted matches are built as React elements.
 */
export default function SemanticSearch() {
  const [query, setQuery] = useState('');
  // The query of the last completed search; null until a search has run.
  const [searchedQuery, setSearchedQuery] = useState<string | null>(null);
  const [results, setResults] = useState<SearchResult[]>([]);
  const [documents, setDocuments] = useState<Document[]>([]);
  const [isSearching, setIsSearching] = useState(false);
  const [isAdding, setIsAdding] = useState(false);
  const [newTitle, setNewTitle] = useState('');
  const [newContent, setNewContent] = useState('');
  const [showAddForm, setShowAddForm] = useState(false);

  // Loads the full document list; failures are logged and leave the list as is.
  const fetchDocuments = useCallback(async () => {
    try {
      const response = await fetch('/api/documents');
      if (!response.ok) {
        throw new Error(`Request failed with status ${response.status}`);
      }
      const data = await response.json();
      setDocuments(data.documents || []);
    } catch (error) {
      console.error('Failed to fetch documents:', error);
    }
  }, []);

  // Fetch documents on component mount (an effect, so it never runs during render).
  useEffect(() => {
    void fetchDocuments();
  }, [fetchDocuments]);

  // Runs the search for the current query; on any failure the results are cleared.
  const handleSearch = async () => {
    if (!query.trim() || isSearching) return;

    setIsSearching(true);
    try {
      const response = await fetch('/api/search', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ query, limit: 10, threshold: 0.6 }),
      });

      if (!response.ok) {
        throw new Error(`Request failed with status ${response.status}`);
      }

      const data = await response.json();
      setResults(data.results || []);
    } catch (error) {
      console.error('Search failed:', error);
      setResults([]);
    } finally {
      setSearchedQuery(query);
      setIsSearching(false);
    }
  };

  // Submits the new document; on success resets the form and refreshes the list.
  const handleAddDocument = async () => {
    if (!newTitle.trim() || !newContent.trim() || isAdding) return;

    setIsAdding(true);
    try {
      const response = await fetch('/api/documents', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ title: newTitle, content: newContent }),
      });

      if (response.ok) {
        setNewTitle('');
        setNewContent('');
        setShowAddForm(false);
        await fetchDocuments(); // Refresh the documents list
      } else {
        console.error('Failed to add document: status', response.status);
      }
    } catch (error) {
      console.error('Failed to add document:', error);
    } finally {
      setIsAdding(false);
    }
  };

  // Splits `text` around case-insensitive occurrences of `term` and wraps each
  // occurrence in <mark>. Returns React nodes so the text is escaped by React.
  const highlightText = (text: string, term: string) => {
    if (!term.trim()) return text;

    const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // With a capturing group, split() keeps the matches at the odd indexes.
    const parts = text.split(new RegExp(`(${escapedTerm})`, 'gi'));

    return parts.map((part, index) =>
      index % 2 === 1 ? (
        <mark key={index} className="bg-yellow-200">
          {part}
        </mark>
      ) : (
        part
      )
    );
  };

  return (
    <div className="max-w-6xl mx-auto p-6">
      <div className="mb-8">
        <h1 className="text-3xl font-bold mb-4">Semantic Search</h1>
        
        {/* Add Document Form */}
        <div className="mb-6">
          <button
            onClick={() => setShowAddForm(!showAddForm)}
            className="px-4 py-2 bg-green-500 text-white rounded hover:bg-green-600"
          >
            {showAddForm ? 'Cancel' : 'Add Document'}
          </button>
          
          {showAddForm && (
            <div className="mt-4 p-4 border rounded-lg bg-gray-50">
              <div className="space-y-3">
                <input
                  type="text"
                  placeholder="Document title..."
                  value={newTitle}
                  onChange={(e) => setNewTitle(e.target.value)}
                  className="w-full p-2 border rounded focus:outline-none focus:ring-2 focus:ring-green-500"
                />
                <textarea
                  placeholder="Document content..."
                  value={newContent}
                  onChange={(e) => setNewContent(e.target.value)}
                  className="w-full p-2 border rounded focus:outline-none focus:ring-2 focus:ring-green-500 min-h-[100px]"
                />
                <button
                  onClick={handleAddDocument}
                  disabled={isAdding || !newTitle.trim() || !newContent.trim()}
                  className="px-4 py-2 bg-green-500 text-white rounded hover:bg-green-600 disabled:bg-gray-300"
                >
                  {isAdding ? 'Adding...' : 'Add Document'}
                </button>
              </div>
            </div>
          )}
        </div>

        {/* Search Interface */}
        <div className="flex gap-3 mb-6">
          <input
            type="text"
            value={query}
            onChange={(e) => setQuery(e.target.value)}
            onKeyDown={(e) => e.key === 'Enter' && handleSearch()}
            placeholder="Search for documents using semantic meaning..."
            className="flex-1 p-3 border rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500"
            disabled={isSearching}
          />
          <button
            onClick={handleSearch}
            disabled={isSearching || !query.trim()}
            className="px-6 py-3 bg-blue-500 text-white rounded-lg hover:bg-blue-600 disabled:bg-gray-300"
          >
            {isSearching ? 'Searching...' : 'Search'}
          </button>
        </div>
      </div>

      <div className="grid md:grid-cols-2 gap-8">
        {/* Search Results */}
        <div>
          <h2 className="text-xl font-semibold mb-4">
            Search Results {results.length > 0 && `(${results.length})`}
          </h2>
          
          {/* Only claim "no results" once a search has actually completed */}
          {results.length === 0 && searchedQuery !== null && !isSearching && (
            <p className="text-gray-500">No results found for "{searchedQuery}"</p>
          )}
          
          <div className="space-y-4">
            {results.map((result) => (
              <div key={result.id} className="border rounded-lg p-4">
                <div className="flex justify-between items-start mb-2">
                  <h3 className="font-semibold">{result.title}</h3>
                  <span className="text-sm bg-blue-100 px-2 py-1 rounded">
                    {(result.similarity * 100).toFixed(1)}% match
                  </span>
                </div>
                <div className="text-gray-700 text-sm mb-2">
                  {highlightText(
                    result.content.slice(0, 200) + (result.content.length > 200 ? '...' : ''),
                    query
                  )}
                </div>
                <div className="text-xs text-gray-500">
                  {new Date(result.created_at).toLocaleDateString()}
                </div>
              </div>
            ))}
          </div>
        </div>

        {/* All Documents */}
        <div>
          <h2 className="text-xl font-semibold mb-4">
            All Documents ({documents.length})
          </h2>
          
          <div className="space-y-4 max-h-96 overflow-y-auto">
            {documents.map((doc) => (
              <div key={doc.id} className="border rounded-lg p-3">
                <h4 className="font-medium mb-1">{doc.title}</h4>
                <p className="text-gray-600 text-sm mb-2">
                  {doc.content.slice(0, 150) + (doc.content.length > 150 ? '...' : '')}
                </p>
                <div className="text-xs text-gray-500">
                  {new Date(doc.created_at).toLocaleDateString()}
                </div>
              </div>
            ))}
          </div>
        </div>
      </div>
    </div>
  );
}

Step 7: Use in App

Render the component from a page, for example app/page.tsx:

import SemanticSearch from '@/components/SemanticSearch';

/** Page component that renders the semantic search UI inside a full-height container. */
export default function Home() {
  const page = (
    <main className="min-h-screen bg-gray-50">
      <SemanticSearch />
    </main>
  );

  return page;
}

Summary

OpenAI embeddings enable powerful semantic search capabilities that understand meaning beyond keyword matching. Combined with vector databases like PostgreSQL with pgvector, you can build intelligent document retrieval systems.


Share this post on:

Previous Post
Setting Up Drizzle ORM with PostgreSQL in Node.js
Next Post
Implementing DALL-E Image Generation in React