RAG-Anything Tutorial: Build a Multimodal Retrieval Pipeline for Text, Tables, Equations, and Images in Colab

The tutorial demonstrates how to build a retrieval pipeline using RAG-Anything that processes text, tables, equations, and images within Google Colab. The process starts by configuring the environment, installing dependencies, and securely entering an OpenAI API key to ensure the notebook runs safely.

Installing RAG-Anything Dependencies

The setup script installs the necessary libraries and fixes a known conflict with the Pillow package. It also imports modules for plotting, PDF generation, and OpenAI access.

import os
import re
import sys
import json
import time
import shutil
import hashlib
import asyncio
import inspect
import getpass
import subprocess
import importlib
import importlib.metadata
from pathlib import Path
from typing import List, Dict, Any
def run_shell(cmd, check=True):
    """Echo a shell command, execute it, and hand back its exit status.

    Args:
        cmd: Command line handed to the shell as a single string.
        check: When true, a non-zero exit status raises instead of returning.

    Returns:
        The integer exit status reported by the command.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero.
    """
    # Show the command in the notebook output before running it.
    print(f"\n$ {cmd}")
    exit_code = subprocess.run(cmd, shell=True, text=True).returncode
    if exit_code == 0 or not check:
        return exit_code
    raise RuntimeError(f"Command failed: {cmd}")
# Banner identifying the tutorial run.
print("=" * 80)
print("RAG-Anything Advanced Colab Tutorial")
print("=" * 80)
print("\n[1/10] Installing dependencies...")
# Forget any PIL modules that are already imported, so a Pillow loaded earlier
# in the session is not reused once the pinned version is installed below.
# list(...) takes a snapshot because entries are deleted while looping.
for module_name in list(sys.modules):
   if module_name == "PIL" or module_name.startswith("PIL."):
       del sys.modules[module_name]
# Install / upgrade (-U) the tutorial's packages quietly (-q).
# run_shell raises RuntimeError if pip exits with a non-zero status.
run_shell(
   'pip -q install -U '
   '"raganything[image,text]" '
   '"openai>=1.0.0" '
   '"python-dotenv" '
   '"reportlab" '
   '"pandas" '
   '"matplotlib" '
   '"tabulate"'
)
# Pin Pillow to one exact version after the bulk install; --force-reinstall
# replaces whatever version the step above pulled in.
run_shell('pip -q install --no-cache-dir --force-reinstall "pillow==11.3.0"')
# Purge PIL a second time: the install above may have imported or replaced it.
for module_name in list(sys.modules):
   if module_name == "PIL" or module_name.startswith("PIL."):
       del sys.modules[module_name]
# Reset the import system's finder caches so freshly installed files are seen.
importlib.invalidate_caches()
# Best-effort report of the installed Pillow version; a failure here is only
# printed and does not stop the notebook.
try:
   print("Pillow version:", importlib.metadata.version("Pillow"))
except Exception as e:
   print("Could not read Pillow version:", repr(e))
print("\n[2/10] Importing libraries...")
# Third-party imports are deferred until here because the packages are only
# guaranteed to exist after the pip steps above have run.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
from openai import AsyncOpenAI
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.utils import EmbeddingFunc
print("Imports successful.")

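The script purges any cached PIL modules from `sys.modules` both before and after the install so that a stale Pillow is never reused, force-reinstalls Pillow 11.3.0, and then invalidates the import caches. It prints the installed Pillow version and imports the tools for data handling and document generation.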

Configuring Directories, Runtime Variables

The code defines paths for assets, output, logs, and RAG working storage. It creates these folders if they do not exist and, because `RESET_STORAGE` is enabled, empties the storage folder (`rag_storage`) before starting.

print("\n[3/10] Preparing directories and runtime settings...")
# Use /content as the root when it exists (the Colab case this tutorial
# targets); otherwise fall back to a folder under the current directory.
BASE_DIR = Path("/content/raganything_advanced_tutorial") if Path("/content").exists() else Path.cwd() / "raganything_advanced_tutorial"
ASSET_DIR = BASE_DIR / "assets"
OUTPUT_DIR = BASE_DIR / "output"
WORKING_DIR = BASE_DIR / "rag_storage"
LOG_DIR = BASE_DIR / "logs"
# When true, WORKING_DIR is deleted and recreated below on every run.
RESET_STORAGE = True
# NOTE(review): the three parse settings below are not read anywhere in the
# visible part of the tutorial; presumably used by a later parsing step.
RUN_FULL_DOCUMENT_PARSE = False
PARSER_FOR_FULL_PARSE = "mineru"
PARSE_METHOD = "auto"
# Create every folder; exist_ok makes re-running the cell harmless.
for d in [BASE_DIR, ASSET_DIR, OUTPUT_DIR, WORKING_DIR, LOG_DIR]:
   d.mkdir(parents=True, exist_ok=True)
# Start from empty storage: remove whatever a previous run left behind and
# recreate the (now empty) folder.
if RESET_STORAGE and WORKING_DIR.exists():
   shutil.rmtree(WORKING_DIR)
   WORKING_DIR.mkdir(parents=True, exist_ok=True)
# Runtime settings exported through the environment.
# NOTE(review): nothing in the visible code reads these names; presumably
# RAG-Anything / LightRAG pick them up - confirm against their docs.
os.environ["LOG_DIR"] = str(LOG_DIR)
os.environ["SUMMARY_LANGUAGE"] = "English"
os.environ["ENABLE_LLM_CACHE"] = "false"
os.environ["ENABLE_LLM_CACHE_FOR_EXTRACT"] = "false"
os.environ["MAX_ASYNC"] = "2"
os.environ["CHUNK_SIZE"] = "900"
os.environ["CHUNK_OVERLAP_SIZE"] = "120"
os.environ["TIMEOUT"] = "240"
# Remove OpenAI-related variables inherited from the session so they cannot
# conflict with the key entered interactively later; pop(..., None) ignores
# names that are not set.
for var in [
   "OPENAI_API_KEY",
   "OPENAI_ORG_ID",
   "OPENAI_ORGANIZATION",
   "OPENAI_PROJECT",
   "OPENAI_DEFAULT_HEADERS",
   "LLM_BINDING_API_KEY",
   "LLM_BINDING_HOST",
]:
   os.environ.pop(var, None)
print(f"Base directory: {BASE_DIR}")
print(f"Assets directory: {ASSET_DIR}")
print(f"Storage directory: {WORKING_DIR}")
print("\n[4/10] Entering OpenAI API key securely...")
def clean_api_key(raw_value: str) -> str:
    """Reduce a pasted credential to the bare ASCII API key.

    Accepts the common ways a key ends up on the clipboard: surrounded by
    whitespace or quotes, as an environment assignment
    (``OPENAI_API_KEY="sk-..."``, optionally prefixed by ``export``, ``set``
    or ``%env``), or as an HTTP header value (``Bearer sk-...`` or
    ``Authorization: Bearer sk-...``, in any letter case).

    Args:
        raw_value: Whatever the user pasted; ``None`` and other falsy values
            are treated as an empty string.

    Returns:
        The key with wrappers, all whitespace and non-ASCII characters
        removed. Empty when nothing usable was pasted.
    """
    quote_chars = "'\"`"

    # Collapse every whitespace run (newlines, tabs, non-breaking spaces) to a
    # single ASCII space so the prefix patterns below see plain separators.
    value = re.sub(r"\s+", " ", str(raw_value or ""))
    # Drop non-ASCII residue such as smart quotes or zero-width characters.
    value = value.encode("ascii", errors="ignore").decode("ascii")
    value = value.strip().strip(quote_chars).strip()

    # Only treat "=" as an assignment when the left side is a variable name.
    # Splitting on any "=" would destroy a key that merely contains or ends
    # with "=" (for example "sk-...==" would collapse to "=").
    assignment = re.match(
        r"(?:(?:export|set|%env)\s+)?[A-Za-z_][A-Za-z0-9_]*\s*=\s*(.*)",
        value,
    )
    if assignment:
        value = assignment.group(1).strip().strip(quote_chars).strip()

    # Remove a leading auth-header wrapper, anchored at the start and
    # case-insensitive, so "BEARER ..." and "Authorization: Bearer ..." work.
    value = re.sub(
        r"^(?:authorization\s*:\s*)?bearer\s+", "", value, flags=re.IGNORECASE
    )
    value = value.strip(quote_chars)

    # A key never contains whitespace; remove anything left from line wraps.
    return re.sub(r"\s+", "", value)
OPENAI_API_KEY_RAW = getpass.getpass("Paste your OpenAI API key here. Input is hidden: ")
OPENAI_API_KEY = clean_api_key(OPENAI_API_KEY_RAW)
if not OPENAI_API_KEY:
   raise ValueError(
       "No API key was captured. Paste the key into the hidden input box and press Enter."
   )
print("Captured key length:", len(OPENAI_API_KEY))
print("Captured key prefix:", OPENAI_API_KEY[:12] + "...")
print("Captured key suffix:", "..." + OPENAI_API_KEY[-6:])
LLM_MODEL = "gpt-4o-mini"
VISION_MODEL = "gpt-4o-mini"
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIM = 1536
openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
os.environ["LLM_MODEL"] = LLM_MODEL
os.environ["VISION_MODEL"] = VISION_MODEL
os.environ["EMBEDDING_MODEL"] = EMBEDDING_MODEL
os.environ["EMBEDDING_DIM"] = str(EMBEDDING_DIM)
print("Testing OpenAI chat API with the captured key...")
try:
   test_response = await openai_client.chat.completions.create(
       model=LLM_MODEL,
       messages=[{"role": "user", "content": "Reply with exactly: ok"}],
       temperature=0,
   )
   print("Chat API test response:", test_response.choices[0].message.content)
except Exception as e:
   raise RuntimeError(
       "The key was captured, but OpenAI rejected the request or the account/model access failed. "
       "Check billing, project permissions, and make sure this is an OpenAI Platform API key."
   ) from e
print("\nTesting OpenAI embedding API...")
try:
   test_embedding = await openai_client.embeddings.create(
       model=EMBEDDING_MODEL,
       input=["RAG-Anything embedding test"],
   )
   print("Embedding vector length:", len(test_embedding.data[0].embedding))
except Exception as e:
   raise RuntimeError(
       "Chat worked, but embeddings failed. Make sure your API key has permission for embeddings."
   ) from e
print("OpenAI API key is working.")
print(f"Chat model: {LLM_MODEL}")
print(f"Vision model: {VISION_MODEL}")
print(f"Embedding model: {EMBEDDING_MODEL}")
print(f"Embedding dimension: {EMBEDDING_DIM}")

The script sets environment variables for language, caching, and chunking. It clears any existing OpenAI environment variables to prevent conflicts. Users must paste their API key into a hidden box. The code strips formatting characters like “Bearer ” and quotes before validating the key.

It then tests the connection by sending a simple chat request and generating an embedding vector. If either step fails, the script stops and displays an error message pointing to billing or permission issues.

Generating a Synthetic Multimodal Report

The tutorial creates sample data to test the system. It generates a table and a chart showing query volume and accuracy trends over six months.

print("\n[5/10] Creating a synthetic multimodal report...")

monthly_data = pd.DataFrame(

{

"Month": ["Jan", "Feb", "Mar", "Apr", "May", "Jun"],

"Query Volume": [1200, 1700, 2100, 2600, 3300, 4100],

"Hybrid Accuracy": [0.71, 0.74, 0.79, 0.83, 0.87, 0.91],

"Average Latency ms": [980, 920, 850, 790, 760, 730],

}

)

table_md = monthly_data.to_markdown(index=False)

plt.figure(figsize=(8, 4.8))

plt.plot(monthly_data["Month"], monthly_data["Query Volume"], marker="o", label="Query Volume")

plt.plot(monthly_data["Month"], monthly_data["Hybrid Accuracy"] * 4000, marker="s", label="Hybrid Accuracy scaled")

plt.title("Multimodal RAG Usage and Quality Trend")

plt.xlabel("Month")

plt.ylabel("Volume / Scaled Accuracy")

plt.legend()

plt.grid(True, alpha=0.3)

plt.text(

0.02,

0.95,

"Synthetic figure: usage rises while latency falls",

transform=plt.gca().transAxes,

fontsize=9,

verticalalignment="top",

bbox=dict(boxstyle="round", alpha=0.15),

)

chart_path = ASSET_DIR / "raganything_quality_trend.png"

plt.tight_layout()

plt
Source Read original →
Related reading
Theker just raised $85M to build the factory robot that doesn’t specialize in anything
NVIDIA cuTile Python Tutorial: Building Tiled GPU Kernels for Vector Addition, Matrix Addition, and Matrix Multiplication in Colab
NVIDIA garak Tutorial: Build a Complete Defensive LLM Red-Teaming Workflow with Custom Probes and Detectors
The Signal — Edition 02. Read this edition → Every Friday: the one AI story that actually mattered, plus the tools worth your time.

AM
AI Maestro is an independent British AI publication. We test what we recommend, and we write it the way we would say it. More about us

RAG-Anything Tutorial: Build a Multimodal Retrieval Pipeline for Text, Tables, Equations, and Images in Colab

Installing RAG-Anything Dependencies

Configuring Directories, Runtime Variables

Generating a Synthetic Multimodal Report

`Empowering Businesses with AI: Smart Tools, Smarter Business Decisions.`

`follow us`

`Popular Tag`

`Popular Post`

`llm-coding-agent 0.1a0`

`Meet Alibaba’s Page Agent:…`

`Microsoft launches $2.5 billion…`

Installing RAG-Anything Dependencies

Configuring Directories, Runtime Variables

Generating a Synthetic Multimodal Report

Related articles

Empowering Businesses with AI: Smart Tools, Smarter Business Decisions.

follow us

Popular Tag

Popular Post

llm-coding-agent 0.1a0

Meet Alibaba’s Page Agent:…

Microsoft launches $2.5 billion…

`Related articles`

`Empowering Businesses with AI: Smart Tools, Smarter Business Decisions.`

`follow us`

`Popular Tag`

`Popular Post`

`llm-coding-agent 0.1a0`

`Meet Alibaba’s Page Agent:…`

`Microsoft launches $2.5 billion…`