feat: Add MCP server for generating notional data

- Implement data generation server with support for insurance data
- Add comprehensive test suite with 16 test cases
- Support custom schemas and data relationships
- Use faker, mimesis, numpy, and SDV for realistic data
- Pass all type checks with pyright and lint checks with ruff

Co-Authored-By: alexander@anthropic.com <alexander@anthropic.com>
This commit is contained in:
Devin AI
2024-12-11 22:48:15 +00:00
parent 7d2f7d22b5
commit e0cfb6c06d
11 changed files with 2297 additions and 0 deletions

42
src/datagen/README.md Normal file
View File

@@ -0,0 +1,42 @@
# MCP Data Generation Server
This server implements the Model Context Protocol (MCP) to provide notional data generation capabilities using Python libraries including Faker, Mimesis, NumPy, and SDV.
## Features
- Generate synthetic data tables based on specified schemas and parameters
- Support for multiple data generation libraries (Faker, Mimesis, SDV)
- Configurable row counts and column specifications
- Export data in CSV format
## Installation
```bash
pip install mcp-server-datagen
```
## Usage
The server exposes an MCP tool for generating notional data:
- `generate_tables`: Generate one or more (optionally related) tables from the built-in insurance schemas or caller-supplied custom schemas, with a configurable row count

Custom schemas are passed through the `schemas` parameter of `generate_tables`; CSV export is performed client-side from the returned column data. There are no separate `define_schema` or `export_csv` tools.
## Development
1. Create virtual environment and install dependencies:
```bash
uv venv
uv pip install -e ".[dev]"
```
2. Run type checking:
```bash
uv run --frozen pyright
```
3. Build package:
```bash
uv build
```

View File

@@ -0,0 +1,30 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "mcp-server-datagen"
version = "0.1.0"
description = "MCP server for generating notional data using Python libraries"
requires-python = ">=3.12"
dependencies = [
"faker>=20.1.0",
"mimesis>=13.1.0",
"numpy>=1.26.0",
"sdv>=1.5.0",
"pandas>=2.1.0",
"mcp>=1.0.0",
"pydantic>=2.0.0"
]
[project.optional-dependencies]
dev = [
    "pytest>=7.0.0",
    "pytest-asyncio>=0.23.0",
    "black>=23.0.0",
    "pyright>=1.1.0"
]
[dependency-groups]
dev = [
"ruff>=0.8.2",
]

View File

@@ -0,0 +1,3 @@
"""MCP server for generating notional data."""
__version__ = "0.1.0"

View File

@@ -0,0 +1,7 @@
"""Main entry point for the data generation server."""
import asyncio
from mcp_server_datagen.server import serve
if __name__ == "__main__":
asyncio.run(serve())

View File

@@ -0,0 +1,75 @@
"""Data generation utilities using Faker, Mimesis, NumPy, and SDV."""
from typing import Any, Dict, List, cast
from faker import Faker
from mimesis import Generic
import numpy as np
from numpy.typing import NDArray
from .synthetic import SyntheticDataGenerator
class DataGenerator:
    """Handles data generation using multiple libraries.

    Columns are generated independently with Faker, Mimesis, or NumPy
    according to each column's ``generator`` key; schemas containing any
    ``correlated`` column are delegated to :class:`SyntheticDataGenerator`
    so foreign keys reference real parent rows.
    """

    def __init__(self):
        self.faker = Faker()
        self.generic = Generic()
        self.synthetic = SyntheticDataGenerator()

    async def generate_table(
        self,
        name: str,
        schema: Dict[str, Dict[str, Any]],
        rows: int = 1000
    ) -> Dict[str, List[Any]]:
        """Generate a table of data based on the provided schema.

        Args:
            name: Name of the table
            schema: Column definitions and parameters
            rows: Number of rows to generate

        Returns:
            Dictionary mapping column names to lists of generated values.

        Raises:
            ValueError: If a column uses an unknown generator or an
                unsupported type/generator combination.
        """
        data: Dict[str, List[Any]] = {}

        # Use SDV-backed generation when any column is correlated.
        if any(col_spec.get("correlated", False) for col_spec in schema.values()):
            return await self.synthetic.generate_synthetic_data(name, schema, rows)

        # Generate individual columns using the specified generators.
        for col_name, col_spec in schema.items():
            generator = col_spec.get("generator", "faker")
            data_type = col_spec.get("type", "string")

            if generator == "faker":
                # data_type names a Faker provider method, e.g. "first_name".
                data[col_name] = [
                    getattr(self.faker, data_type)()
                    for _ in range(rows)
                ]
            elif generator == "mimesis":
                data[col_name] = [
                    getattr(self.generic, data_type)()
                    for _ in range(rows)
                ]
            elif generator == "numpy":
                if data_type == "int":
                    # randint's high bound is exclusive; +1 keeps "max"
                    # inclusive, consistent with the synthetic generator.
                    int_values: NDArray[np.int64] = np.random.randint(
                        low=col_spec.get("min", 0),
                        high=col_spec.get("max", 100) + 1,
                        size=rows,
                        dtype=np.int64
                    )
                    data[col_name] = cast(List[Any], int_values.tolist())
                elif data_type == "float":
                    min_val = float(col_spec.get("min", 0.0))
                    max_val = float(col_spec.get("max", 1.0))
                    float_values = np.random.uniform(
                        low=min_val,
                        high=max_val,
                        size=rows
                    ).astype(np.float64)
                    data[col_name] = cast(List[Any], float_values.tolist())
                elif data_type == "category":
                    # Previously unhandled: the column was silently dropped,
                    # causing a missing key for callers.
                    categories = col_spec.get("categories", [])
                    if not categories:
                        raise ValueError(
                            f"Column '{col_name}' of type 'category' requires "
                            "a non-empty 'categories' list"
                        )
                    data[col_name] = cast(
                        List[Any],
                        np.random.choice(categories, size=rows).tolist()
                    )
                else:
                    raise ValueError(
                        f"Unsupported numpy type '{data_type}' for column '{col_name}'"
                    )
            else:
                # Fail loudly instead of silently omitting the column.
                raise ValueError(
                    f"Unknown generator '{generator}' for column '{col_name}'"
                )
        return data

View File

@@ -0,0 +1,252 @@
import asyncio
import json
from typing import Any, Dict, List, Sequence

from mcp.server import Server
from mcp.server.stdio import stdio_server
from mcp.shared.exceptions import McpError
from mcp.types import (
    INTERNAL_ERROR,
    INVALID_PARAMS,
    EmbeddedResource,
    ErrorData,
    ImageContent,
    TextContent,
    Tool,
)

from mcp_server_datagen.synthetic import SyntheticDataGenerator
class DataGenServer:
    """MCP server for generating notional data.

    Holds the default insurance table schemas (customers, policies, claims)
    and delegates generation to :class:`SyntheticDataGenerator`.
    """

    def __init__(self):
        # Built-in column specifications. "min"/"max" bounds are inclusive;
        # "correlated" marks foreign keys that must reference parent rows.
        self.default_schemas = {
            "customers": {
                "customer_id": {
                    "type": "int",
                    "generator": "numpy",
                    "min": 10000,
                    "max": 99999
                },
                "first_name": {
                    "type": "first_name",
                    "generator": "faker"
                },
                "last_name": {
                    "type": "last_name",
                    "generator": "faker"
                },
                "email": {
                    "type": "email",
                    "generator": "faker"
                },
                "phone": {
                    "type": "phone_number",
                    "generator": "faker"
                },
                "address": {
                    "type": "address",
                    "generator": "faker"
                },
                "date_of_birth": {
                    "type": "date_of_birth",
                    "generator": "faker"
                },
                "credit_score": {
                    "type": "int",
                    "generator": "numpy",
                    "min": 300,
                    "max": 850
                }
            },
            "policies": {
                "policy_id": {
                    "type": "int",
                    "generator": "numpy",
                    "min": 100000,
                    "max": 999999
                },
                "customer_id": {
                    "type": "int",
                    "generator": "numpy",
                    "min": 10000,
                    "max": 99999,
                    "correlated": True
                },
                "policy_type": {
                    "type": "category",
                    "generator": "numpy",
                    "categories": ["auto", "home", "life", "health"]
                },
                "start_date": {
                    "type": "date_this_decade",
                    "generator": "faker"
                },
                "end_date": {
                    "type": "date_this_decade",
                    "generator": "faker"
                },
                "premium": {
                    "type": "float",
                    "generator": "numpy",
                    "min": 500.0,
                    "max": 5000.0
                },
                "coverage_amount": {
                    "type": "float",
                    "generator": "numpy",
                    "min": 50000.0,
                    "max": 1000000.0
                },
                "status": {
                    "type": "category",
                    "generator": "numpy",
                    "categories": ["active", "expired", "cancelled", "pending"]
                }
            },
            "claims": {
                "claim_id": {
                    "type": "int",
                    "generator": "numpy",
                    "min": 1000000,
                    "max": 9999999
                },
                "policy_id": {
                    "type": "int",
                    "generator": "numpy",
                    "min": 100000,
                    "max": 999999,
                    "correlated": True
                },
                "date_filed": {
                    "type": "date_this_year",
                    "generator": "faker"
                },
                "incident_date": {
                    "type": "date_this_year",
                    "generator": "faker"
                },
                "claim_type": {
                    "type": "category",
                    "generator": "numpy",
                    "categories": ["accident", "theft", "natural_disaster", "medical", "property_damage"]
                },
                "amount_claimed": {
                    "type": "float",
                    "generator": "numpy",
                    "min": 1000.0,
                    "max": 100000.0
                },
                "status": {
                    "type": "category",
                    "generator": "numpy",
                    "categories": ["pending", "approved", "denied", "in_review"]
                },
                "description": {
                    "type": "text",
                    "generator": "faker"
                }
            }
        }
        self.generator = SyntheticDataGenerator()
        # Share defaults so correlated columns can auto-generate parent tables.
        self.generator.default_schemas = self.default_schemas

    async def list_tools(self) -> List[Tool]:
        """List available data generation tools."""
        return [
            Tool(
                name="generate_tables",
                description="Generate multiple tables of notional data",
                inputSchema={
                    "type": "object",
                    "properties": {
                        "tables": {
                            "type": "array",
                            "items": {"type": "string"}
                        },
                        "rows": {"type": "integer", "minimum": 1},
                        "schemas": {
                            "type": "object",
                            "additionalProperties": {
                                "type": "object",
                                "additionalProperties": {
                                    "type": "object",
                                    "properties": {
                                        "type": {"type": "string"},
                                        "generator": {"type": "string"},
                                        "min": {"type": "number"},
                                        "max": {"type": "number"},
                                        "categories": {
                                            "type": "array",
                                            "items": {"type": "string"}
                                        }
                                    }
                                }
                            }
                        }
                    },
                    "required": ["tables"]
                }
            )
        ]

    async def handle_generate_tables(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Handle generate_tables tool requests.

        Args:
            params: Tool arguments — ``tables`` (required list of names),
                ``rows`` (default 1000), ``schemas`` (optional overrides).

        Returns:
            Mapping of table name to column-name -> values dictionaries.

        Raises:
            ValueError: For unknown table names or non-positive row counts.
            McpError: For unexpected generation failures.
        """
        tables = params.get("tables", [])
        rows = params.get("rows", 1000)
        custom_schemas = params.get("schemas", {})

        if rows <= 0:
            raise ValueError("Row count must be positive")

        results: Dict[str, Any] = {}
        try:
            for table_name in tables:
                if table_name not in self.default_schemas and table_name not in custom_schemas:
                    raise ValueError(f"Unknown table: {table_name}")
                # Use custom schema if provided, otherwise use default.
                schema = custom_schemas.get(table_name, self.default_schemas.get(table_name, {}))
                data = await self.generator.generate_synthetic_data(
                    table_name=table_name,
                    schema=schema,
                    rows=rows
                )
                results[table_name] = data
            return results
        except ValueError:
            # Re-raise validation errors directly, preserving the traceback.
            raise
        except Exception as e:
            # McpError requires an ErrorData payload (mcp>=1.0); a bare
            # string previously raised AttributeError inside McpError.
            raise McpError(
                ErrorData(code=INTERNAL_ERROR, message=f"Error generating data: {str(e)}")
            ) from e
async def serve() -> None:
    """Start the MCP server on stdio.

    Registers the tool-listing and tool-call handlers on an MCP ``Server``
    and runs until the stdio streams close.
    """
    server = Server("mcp-datagen")
    datagen_server = DataGenServer()

    @server.list_tools()
    async def list_tools() -> List[Tool]:
        """List available data generation tools."""
        return await datagen_server.list_tools()

    @server.call_tool()
    async def call_tool(
        name: str, arguments: Dict[str, Any]
    ) -> Sequence[TextContent | ImageContent | EmbeddedResource]:
        """Dispatch a tool call to the matching handler."""
        if name == "generate_tables":
            result = await datagen_server.handle_generate_tables(arguments)
            return [
                TextContent(
                    type="text",
                    text=json.dumps({"tables": result}, indent=2)
                )
            ]
        # McpError requires an ErrorData payload (mcp>=1.0); a bare string
        # previously raised AttributeError inside McpError itself.
        raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown tool: {name}"))

    options = server.create_initialization_options()
    async with stdio_server() as (read_stream, write_stream):
        await server.run(read_stream, write_stream, options)


if __name__ == "__main__":
    asyncio.run(serve())

View File

@@ -0,0 +1,328 @@
"""Synthetic data generation using SDV."""
from typing import Dict, List, Any, Set
import pandas as pd
import numpy as np
from faker import Faker
from mimesis import Generic
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata
from datetime import datetime, timedelta
class SyntheticDataGenerator:
    """Handles synthetic data generation using SDV.

    Generates table data column-by-column from a schema while tracking the
    primary-key IDs produced per table, so that foreign-key ("correlated")
    columns in related tables reference real parent rows.
    """

    def __init__(self):
        """Initialize the generator."""
        self.synthesizers: Dict[str, GaussianCopulaSynthesizer] = {}
        self.metadata: Dict[str, SingleTableMetadata] = {}
        self.faker = Faker()
        self.mimesis = Generic()
        # Primary-key IDs generated so far, keyed by table name; consumed
        # by correlated (foreign-key) columns of child tables.
        self.generated_ids: Dict[str, Set[int]] = {}
        self.generated_ids["customers"] = set()
        self.generated_ids["policies"] = set()
        self.generated_ids["claims"] = set()
        # Schemas used to auto-generate missing parent tables on demand.
        self.default_schemas: Dict[str, Dict[str, Dict[str, Any]]] = {}
        # Per-table window counter for hybrid sequential-random ID generation.
        self.id_counters: Dict[str, int] = {}

    def create_metadata(
        self,
        table_name: str,
        schema: Dict[str, Dict[str, Any]]
    ) -> SingleTableMetadata:
        """Create SDV metadata for a table based on its schema."""
        metadata = SingleTableMetadata()
        for col_name, col_spec in schema.items():
            data_type = col_spec.get("type", "string")
            sdtype = self._map_type_to_sdtype(data_type)
            metadata.add_column(
                column_name=col_name,
                sdtype=sdtype
            )
        return metadata

    def _map_type_to_sdtype(self, data_type: str) -> str:
        """Map a schema data type to the corresponding SDV sdtype."""
        type_mapping = {
            "string": "categorical",
            "int": "numerical",
            "float": "numerical",
            "datetime": "datetime",
            "boolean": "boolean",
            "category": "categorical"
        }
        # Unknown types fall back to categorical.
        return type_mapping.get(data_type, "categorical")

    def _generate_faker_value(self, generator: str) -> Any:
        """Generate a value from a dotted Faker spec, e.g. "faker.email"."""
        if not generator.startswith("faker."):
            return None
        method_name = generator.split(".", 1)[1]
        if hasattr(self.faker, method_name):
            return getattr(self.faker, method_name)()
        return None

    def _generate_mimesis_value(self, generator: str) -> Any:
        """Generate a value from a dotted Mimesis spec, e.g. "mimesis.text.text"."""
        if not generator.startswith("mimesis."):
            return None
        category, method = generator.split(".", 1)[1].split(".")
        if hasattr(self.mimesis, category):
            category_instance = getattr(self.mimesis, category)
            if hasattr(category_instance, method):
                return getattr(category_instance, method)()
        return None

    def _generate_unique_id(
        self,
        table_name: str,
        col_spec: Dict[str, Any]
    ) -> int:
        """Generate a unique ID for a table using a hybrid sequential-random approach.

        IDs are drawn from small sequential windows with a random offset, so
        they are unique, roughly increasing, and spread across [min, max]
        (both bounds inclusive).
        """
        min_val = col_spec.get("min", 1)
        max_val = col_spec.get("max", 1000000)
        range_size = max_val - min_val + 1

        if table_name not in self.id_counters:
            self.id_counters[table_name] = 0
        if table_name not in self.generated_ids:
            self.generated_ids[table_name] = set()

        # Use 0.1% of the range (at least 1) as the random window size.
        window_size = max(1, range_size // 1000)
        attempts = 0
        max_attempts = 10  # Limit retries to avoid infinite loops.

        while attempts < max_attempts:
            base = min_val + (self.id_counters[table_name] * window_size)
            offset = np.random.randint(0, window_size)
            new_id = base + offset
            # Handle wraparound past the top of the range.
            if new_id > max_val:
                self.id_counters[table_name] = 0
                new_id = min_val + np.random.randint(0, window_size)
            # Accept only IDs not seen before; cast to a plain Python int so
            # results are JSON-serializable.
            if new_id not in self.generated_ids[table_name]:
                new_id = int(new_id)
                self.generated_ids[table_name].add(new_id)
                self.id_counters[table_name] += 1
                return new_id
            attempts += 1

        # Current window exhausted: advance to the next window and retry.
        self.id_counters[table_name] += 1
        return self._generate_unique_id(table_name, col_spec)

    def _generate_correlated_id(self, parent_table: str) -> int:
        """Pick a random existing ID from a parent table (foreign key)."""
        if not self.generated_ids.get(parent_table):
            raise ValueError(f"No IDs available for parent table {parent_table}")
        parent_ids = list(self.generated_ids[parent_table])
        # int() avoids leaking np.int64 into results (JSON-serializable).
        return int(np.random.choice(parent_ids))

    def _extract_parent_table(self, column_name: str) -> str:
        """Derive the parent table name from a foreign-key column name."""
        if not column_name.endswith("_id"):
            raise ValueError(f"Column {column_name} is not a foreign key")
        table_name = column_name[:-3]  # Strip the "_id" suffix.
        # Handle irregular plurals before the default "+s" rule.
        irregular_plurals = {
            "policy": "policies",
            "company": "companies",
            "category": "categories"
        }
        if table_name in irregular_plurals:
            return irregular_plurals[table_name]
        if not table_name.endswith('s'):
            table_name += 's'
        return table_name

    def _clear_generated_ids(self, table_name: str) -> None:
        """Clear generated IDs for a table."""
        if table_name in self.generated_ids:
            del self.generated_ids[table_name]

    async def fit_synthesizer(
        self,
        table_name: str,
        schema: Dict[str, Dict[str, Any]]
    ) -> None:
        """Fit a GaussianCopula synthesizer on a small generated sample."""
        metadata = self.create_metadata(table_name, schema)
        fitting_size = 100  # Small sample is enough for fitting.

        sample_data: Dict[str, List[Any]] = {}
        for col_name, col_spec in schema.items():
            col_type = col_spec["type"]
            is_correlated = col_spec.get("correlated", False)

            if is_correlated and col_name.endswith("_id"):
                # Foreign keys sample from the parent table's generated IDs.
                parent_table = self._extract_parent_table(col_name)
                if parent_table not in self.generated_ids:
                    raise ValueError(f"Parent table {parent_table} must be generated before {table_name}")
                parent_ids = list(self.generated_ids[parent_table])
                sample_data[col_name] = [
                    np.random.choice(parent_ids) for _ in range(fitting_size)
                ]
            elif col_type == "int":
                min_val = col_spec.get("min", 0)
                max_val = col_spec.get("max", 100)
                if col_name.endswith("_id"):
                    # Primary keys must be unique within the sample.
                    # Bug fix: the old code called
                    # _generate_unique_id(min_val, max_val), passing ints
                    # where (table_name, col_spec) were expected, which
                    # crashed with AttributeError. Draw locally instead.
                    unique_ids: Set[int] = set()
                    while len(unique_ids) < fitting_size:
                        unique_ids.add(int(np.random.randint(min_val, max_val + 1)))
                    sample_data[col_name] = list(unique_ids)
                else:
                    sample_data[col_name] = [
                        np.random.randint(min_val, max_val + 1)
                        for _ in range(fitting_size)
                    ]
            elif col_type == "float":
                min_val = col_spec.get("min", 0.0)
                max_val = col_spec.get("max", 1.0)
                sample_data[col_name] = [
                    np.random.uniform(min_val, max_val)
                    for _ in range(fitting_size)
                ]
            elif col_type == "category":
                categories = col_spec.get("categories", [])
                sample_data[col_name] = [
                    np.random.choice(categories)
                    for _ in range(fitting_size)
                ]
            elif col_type == "datetime":
                if "generator" in col_spec:
                    generator_str = col_spec["generator"]
                    if generator_str.startswith("faker."):
                        sample_data[col_name] = [
                            self._generate_faker_value(generator_str)
                            for _ in range(fitting_size)
                        ]
                    elif generator_str.startswith("mimesis."):
                        sample_data[col_name] = [
                            self._generate_mimesis_value(generator_str)
                            for _ in range(fitting_size)
                        ]
                else:
                    # Default to uniformly random datetimes this year.
                    current_year = datetime.now().year
                    start = datetime(current_year, 1, 1)
                    end = datetime(current_year, 12, 31)
                    sample_data[col_name] = [
                        start + timedelta(
                            seconds=np.random.randint(0, int((end - start).total_seconds()))
                        )
                        for _ in range(fitting_size)
                    ]
            elif col_type == "string":
                if "generator" in col_spec:
                    generator_str = col_spec["generator"]
                    if generator_str.startswith("faker."):
                        sample_data[col_name] = [
                            self._generate_faker_value(generator_str)
                            for _ in range(fitting_size)
                        ]
                    elif generator_str.startswith("mimesis."):
                        sample_data[col_name] = [
                            self._generate_mimesis_value(generator_str)
                            for _ in range(fitting_size)
                        ]
                else:
                    # Default to a random 10-letter lowercase string.
                    sample_data[col_name] = [
                        ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), size=10))
                        for _ in range(fitting_size)
                    ]
            # NOTE(review): other types (e.g. "boolean") are silently
            # skipped here, as in the original — confirm intended.

        df = pd.DataFrame(sample_data)
        synthesizer = GaussianCopulaSynthesizer(metadata)
        synthesizer.fit(df)
        self.synthesizers[table_name] = synthesizer

    async def generate_synthetic_data(
        self,
        table_name: str,
        schema: Dict[str, Dict[str, Any]],
        rows: int = 1000
    ) -> Dict[str, List[Any]]:
        """Generate synthetic data for a table.

        Parent tables referenced by correlated columns are generated first
        (from ``default_schemas``) if their IDs are not yet available.

        Returns:
            Mapping of column name to a list of ``rows`` generated values.

        Raises:
            ValueError: If a correlated column's parent schema is unknown.
        """
        result: Dict[str, List[Any]] = {col_name: [] for col_name in schema.keys()}

        # Collect parent tables needed by correlated (foreign-key) columns.
        parent_tables = set()
        for col_name, col_spec in schema.items():
            if col_spec.get("correlated", False):
                parent_table = self._extract_parent_table(col_name)
                parent_tables.add((parent_table, col_name))

        # Generate parent table data first where missing.
        for parent_table, col_name in parent_tables:
            if parent_table not in self.generated_ids or not self.generated_ids[parent_table]:
                if hasattr(self, 'default_schemas') and parent_table in self.default_schemas:
                    parent_schema = self.default_schemas[parent_table]
                    await self.generate_synthetic_data(parent_table, parent_schema, rows)
                else:
                    raise ValueError(f"Parent table {parent_table} schema not found")

        # Generate data row by row so unique/correlated IDs stay consistent.
        for _ in range(rows):
            for col_name, col_spec in schema.items():
                col_type = col_spec["type"]
                value = None

                if col_name.endswith("_id") and not col_spec.get("correlated", False):
                    # Primary key: unique within this table.
                    value = self._generate_unique_id(table_name, col_spec)
                elif col_spec.get("correlated", False):
                    # Foreign key: sample an existing parent ID.
                    parent_table = self._extract_parent_table(col_name)
                    value = self._generate_correlated_id(parent_table)
                elif col_type == "string":
                    if "generator" in col_spec:
                        if col_spec["generator"].startswith("faker."):
                            value = self._generate_faker_value(col_spec["generator"])
                        elif col_spec["generator"].startswith("mimesis."):
                            value = self._generate_mimesis_value(col_spec["generator"])
                    elif "categories" in col_spec:
                        value = np.random.choice(col_spec["categories"])
                    else:
                        value = self._generate_faker_value("faker.word")
                elif col_type == "int":
                    # randint's upper bound is exclusive; +1 keeps "max"
                    # inclusive, matching fit_synthesizer and the ID helpers.
                    value = int(np.random.randint(
                        col_spec.get("min", 0), col_spec.get("max", 100) + 1
                    ))
                elif col_type == "float":
                    value = np.random.uniform(col_spec.get("min", 0.0), col_spec.get("max", 1.0))
                elif col_type == "datetime":
                    if "generator" in col_spec:
                        value = self._generate_faker_value(col_spec["generator"])
                    else:
                        value = self._generate_faker_value("faker.date_time_this_decade")
                elif col_type == "category":
                    value = np.random.choice(col_spec["categories"])

                result[col_name].append(value)

        # Record this table's primary-key IDs for future correlated columns.
        for col_name, values in result.items():
            if col_name.endswith("_id") and not schema[col_name].get("correlated", False):
                if table_name not in self.generated_ids:
                    self.generated_ids[table_name] = set()
                self.generated_ids[table_name].update(values)

        return result

View File

@@ -0,0 +1,168 @@
"""Unit tests for insurance-specific data generation."""
import pytest
import pandas as pd
import numpy as np
from mcp_server_datagen.synthetic import SyntheticDataGenerator
@pytest.fixture
def data_generator(customers_schema, policies_schema, claims_schema):
    """Provide a SyntheticDataGenerator preloaded with the insurance schemas."""
    generator = SyntheticDataGenerator()
    generator.default_schemas = dict(
        customers=customers_schema,
        policies=policies_schema,
        claims=claims_schema,
    )
    return generator
@pytest.fixture
def customers_schema():
    """Create the customers table schema."""
    # Dotted "faker.<method>" specs select Faker providers; numeric columns
    # carry inclusive min/max bounds.
    return {
        "customer_id": {"type": "int", "min": 10000, "max": 99999},
        "first_name": {"type": "string", "generator": "faker.first_name"},
        "last_name": {"type": "string", "generator": "faker.last_name"},
        "email": {"type": "string", "generator": "faker.email"},
        "phone": {"type": "string", "generator": "faker.phone_number"},
        "address": {"type": "string", "generator": "faker.address"},
        "date_of_birth": {"type": "datetime", "generator": "faker.date_of_birth"},
        "credit_score": {"type": "int", "min": 300, "max": 850}
    }
@pytest.fixture
def policies_schema():
    """Create the policies table schema."""
    # "correlated": True marks customer_id as a foreign key into customers.
    return {
        "policy_id": {"type": "int", "min": 100000, "max": 999999},
        "customer_id": {"type": "int", "min": 10000, "max": 99999, "correlated": True},
        "policy_type": {"type": "category", "categories": ["auto", "home", "life", "health"]},
        "start_date": {"type": "datetime"},
        "end_date": {"type": "datetime"},
        "premium": {"type": "float", "min": 500.0, "max": 5000.0},
        "coverage_amount": {"type": "float", "min": 50000.0, "max": 1000000.0},
        "status": {"type": "category", "categories": ["active", "expired", "cancelled", "pending"]}
    }
@pytest.fixture
def claims_schema():
    """Create the claims table schema."""
    # policy_id is a foreign key into policies; description uses Mimesis.
    return {
        "claim_id": {"type": "int", "min": 1000000, "max": 9999999},
        "policy_id": {"type": "int", "min": 100000, "max": 999999, "correlated": True},
        "date_filed": {"type": "datetime"},
        "incident_date": {"type": "datetime"},
        "claim_type": {"type": "category", "categories": [
            "accident", "theft", "natural_disaster", "medical", "property_damage"
        ]},
        "amount_claimed": {"type": "float", "min": 1000.0, "max": 100000.0},
        "status": {"type": "category", "categories": ["pending", "approved", "denied", "in_review"]},
        "description": {"type": "string", "generator": "mimesis.text.text"}
    }
@pytest.mark.asyncio
async def test_generate_customers_table(data_generator, customers_schema):
    """Test generation of customers table with 10,000 rows."""
    row_count = 10000
    table = await data_generator.generate_synthetic_data("customers", customers_schema, row_count)
    # Every column must contain exactly row_count values.
    for values in table.values():
        assert len(values) == row_count
    # Numeric columns stay inside their configured ranges.
    for cid in table["customer_id"]:
        assert isinstance(cid, (int, np.integer))
        assert 10000 <= cid <= 99999
    for score in table["credit_score"]:
        assert 300 <= score <= 850
    # Faker-backed text columns are non-empty strings.
    for email in table["email"]:
        assert isinstance(email, str) and "@" in email
    for column in ("first_name", "last_name"):
        assert all(isinstance(v, str) and v for v in table[column])
@pytest.mark.asyncio
async def test_generate_policies_table(data_generator, policies_schema):
    """Test generation of policies table with 10,000 rows."""
    row_count = 10000
    table = await data_generator.generate_synthetic_data("policies", policies_schema, row_count)
    # Row count per column.
    for values in table.values():
        assert len(values) == row_count
    # Types and configured ranges.
    for pid in table["policy_id"]:
        assert isinstance(pid, (int, np.integer)) and 100000 <= pid <= 999999
    for premium in table["premium"]:
        assert isinstance(premium, (float, np.floating)) and 500.0 <= premium <= 5000.0
    for amount in table["coverage_amount"]:
        assert 50000.0 <= amount <= 1000000.0
    # Categorical columns draw only from their configured category sets.
    assert set(table["policy_type"]) <= {"auto", "home", "life", "health"}
    assert set(table["status"]) <= {"active", "expired", "cancelled", "pending"}
@pytest.mark.asyncio
async def test_generate_claims_table(data_generator, claims_schema):
    """Test generation of claims table with 10,000 rows."""
    row_count = 10000
    table = await data_generator.generate_synthetic_data("claims", claims_schema, row_count)
    # Row count per column.
    for values in table.values():
        assert len(values) == row_count
    # Types and configured ranges.
    for cid in table["claim_id"]:
        assert isinstance(cid, (int, np.integer)) and 1000000 <= cid <= 9999999
    for amount in table["amount_claimed"]:
        assert isinstance(amount, (float, np.floating)) and 1000.0 <= amount <= 100000.0
    # Categorical columns draw only from their configured category sets.
    assert set(table["claim_type"]) <= {
        "accident", "theft", "natural_disaster", "medical", "property_damage"
    }
    assert set(table["status"]) <= {"pending", "approved", "denied", "in_review"}
    # Mimesis-generated descriptions are non-empty strings.
    for description in table["description"]:
        assert isinstance(description, str) and description
@pytest.mark.asyncio
async def test_data_relationships(data_generator, customers_schema, policies_schema, claims_schema):
    """Test relationships between tables."""
    # Generate all three tables in dependency order.
    customers = await data_generator.generate_synthetic_data("customers", customers_schema, 1000)
    policies = await data_generator.generate_synthetic_data("policies", policies_schema, 2000)
    claims = await data_generator.generate_synthetic_data("claims", claims_schema, 3000)
    # Every policy must reference an existing customer...
    assert set(policies["customer_id"]) <= set(customers["customer_id"])
    # ...and every claim an existing policy.
    assert set(claims["policy_id"]) <= set(policies["policy_id"])
@pytest.mark.asyncio
async def test_csv_export(data_generator, customers_schema, tmp_path):
    """Test CSV export functionality."""
    row_count = 100
    table = await data_generator.generate_synthetic_data("customers", customers_schema, row_count)
    # Round-trip the data through a CSV file.
    out_file = tmp_path / "customers.csv"
    pd.DataFrame(table).to_csv(out_file, index=False)
    loaded = pd.read_csv(out_file)
    assert len(loaded) == row_count
    for column in customers_schema:
        assert column in loaded.columns

View File

@@ -0,0 +1,163 @@
"""Unit tests for MCP data generation server."""
import pytest
from typing import Dict
from mcp_server_datagen.server import DataGenServer
@pytest.fixture
def server():
    """Create a server instance for testing."""
    # Fresh instance per test so generated-ID state does not leak between tests.
    return DataGenServer()
@pytest.mark.asyncio
async def test_list_tools(server):
    """Test that the server correctly lists available tools."""
    tools = await server.list_tools()
    # A non-empty list of tools must be advertised.
    assert isinstance(tools, list)
    assert tools
    # generate_tables must be among them.
    by_name = {tool.name: tool for tool in tools}
    assert "generate_tables" in by_name
    # Its input schema exposes the expected parameters.
    input_schema = by_name["generate_tables"].inputSchema
    assert input_schema is not None
    for prop in ("tables", "rows", "schemas"):
        assert prop in input_schema["properties"]
@pytest.mark.asyncio
async def test_generate_insurance_tables(server):
    """Test generation of insurance tables through the server."""
    params = {
        "tables": ["customers", "policies", "claims"],
        "rows": 100
    }
    result = await server.handle_generate_tables(params)
    # Every requested table comes back in the mapping.
    assert isinstance(result, Dict)
    for requested in params["tables"]:
        assert requested in result
    # Each table is a column -> values mapping with the requested row count.
    for table_data in result.values():
        assert isinstance(table_data, Dict)
        first_column = next(iter(table_data.values()))
        assert len(first_column) == params["rows"]
@pytest.mark.asyncio
async def test_generate_custom_schema(server):
    """Test generation with custom schema."""
    custom_schema = {
        "test_table": {
            "id": {"type": "int", "min": 1, "max": 100},
            "name": {"type": "string", "generator": "faker.name"},
            "description": {"type": "string", "generator": "mimesis.text.text"}
        }
    }
    result = await server.handle_generate_tables(
        {"tables": ["test_table"], "rows": 50, "schemas": custom_schema}
    )
    # The custom table is generated with the requested shape and bounds.
    assert "test_table" in result
    table = result["test_table"]
    assert len(table["id"]) == 50
    for value in table["id"]:
        assert 1 <= value <= 100
    for column in ("name", "description"):
        assert all(isinstance(v, str) for v in table[column])
@pytest.mark.asyncio
async def test_invalid_table_name(server):
    """Test error handling for invalid table names."""
    # A table with neither a default nor a custom schema must be rejected.
    with pytest.raises(ValueError):
        await server.handle_generate_tables(
            {"tables": ["nonexistent_table"], "rows": 100}
        )
@pytest.mark.asyncio
async def test_invalid_row_count(server):
    """Test error handling for invalid row counts."""
    # Non-positive row counts must be rejected with ValueError.
    with pytest.raises(ValueError):
        await server.handle_generate_tables(
            {"tables": ["customers"], "rows": -1}
        )
@pytest.mark.asyncio
async def test_large_dataset_generation(server):
    """Test generation of large datasets (10,000 rows)."""
    row_count = 10000
    result = await server.handle_generate_tables(
        {"tables": ["customers", "policies", "claims"], "rows": row_count}
    )
    # Every table contains the requested number of rows.
    for table_data in result.values():
        assert len(next(iter(table_data.values()))) == row_count
    # Referential integrity across the generated tables:
    customers = result["customers"]
    policies = result["policies"]
    claims = result["claims"]
    # policies reference existing customers...
    assert set(policies["customer_id"]) <= set(customers["customer_id"])
    # ...and claims reference existing policies.
    assert set(claims["policy_id"]) <= set(policies["policy_id"])
@pytest.mark.asyncio
async def test_csv_export_format(server):
    """Test that generated data can be exported as CSV."""
    import pandas as pd
    import tempfile
    import os
    result = await server.handle_generate_tables(
        {"tables": ["customers"], "rows": 100}
    )
    # Round-trip the customers table through a CSV file on disk.
    with tempfile.TemporaryDirectory() as tmp_dir:
        csv_path = os.path.join(tmp_dir, "customers.csv")
        pd.DataFrame(result["customers"]).to_csv(csv_path, index=False)
        loaded = pd.read_csv(csv_path)
        assert len(loaded) == 100
        for column in server.default_schemas["customers"]:
            assert column in loaded.columns

View File

@@ -0,0 +1,114 @@
"""Unit tests for synthetic data generation."""
import pytest
import numpy as np
from mcp_server_datagen.synthetic import SyntheticDataGenerator
@pytest.fixture
def data_generator():
    """Create a data generator instance for testing."""
    # Fresh generator per test so ID-uniqueness state is isolated.
    return SyntheticDataGenerator()
@pytest.fixture
def sample_schema():
    """Create a sample schema for testing."""
    # Mix of int/float ranges and a string column constrained to categories.
    return {
        "id": {"type": "int", "min": 1, "max": 1000},
        "name": {"type": "string", "categories": ["Alice", "Bob", "Charlie"]},
        "age": {"type": "int", "min": 18, "max": 100},
        "score": {"type": "float", "min": 0.0, "max": 1.0},
    }
@pytest.mark.asyncio
async def test_create_metadata(data_generator, sample_schema):
    """Test metadata creation from schema."""
    metadata = data_generator.create_metadata("test_table", sample_schema)
    # Column set matches the schema exactly.
    assert set(metadata.columns.keys()) == set(sample_schema.keys())
    # int/float map to "numerical"; strings map to "categorical".
    expected_sdtypes = {
        "id": "numerical",
        "name": "categorical",
        "age": "numerical",
        "score": "numerical",
    }
    for column, sdtype in expected_sdtypes.items():
        assert metadata.columns[column]["sdtype"] == sdtype
@pytest.mark.asyncio
async def test_generate_synthetic_data(data_generator, sample_schema):
    """Test synthetic data generation."""
    row_count = 100
    table = await data_generator.generate_synthetic_data("test_table", sample_schema, row_count)
    # All schema columns present, each with row_count values.
    assert set(table.keys()) == set(sample_schema.keys())
    for values in table.values():
        assert len(values) == row_count
    # Types and ranges per column.
    for value in table["id"]:
        assert isinstance(value, (int, np.integer)) and 1 <= value <= 1000
    for value in table["name"]:
        assert isinstance(value, str) and value in ("Alice", "Bob", "Charlie")
    for value in table["age"]:
        assert isinstance(value, (int, np.integer)) and 18 <= value <= 100
    for value in table["score"]:
        assert isinstance(value, (float, np.floating)) and 0.0 <= value <= 1.0
@pytest.mark.asyncio
async def test_generate_large_dataset(data_generator):
    """Test generation of a large dataset."""
    schema = {
        "customer_id": {"type": "int", "min": 10000, "max": 99999},
        "first_name": {"type": "string"},
        "last_name": {"type": "string"},
        "age": {"type": "int", "min": 18, "max": 100},
        "credit_score": {"type": "int", "min": 300, "max": 850},
    }
    row_count = 10000
    table = await data_generator.generate_synthetic_data("customers", schema, row_count)
    # Row count per column.
    for values in table.values():
        assert len(values) == row_count
    # Each bounded column stays inside its configured range.
    for column, low, high in (
        ("customer_id", 10000, 99999),
        ("age", 18, 100),
        ("credit_score", 300, 850),
    ):
        assert all(low <= v <= high for v in table[column])
    # IDs should be (nearly) unique; tolerate a few random collisions.
    assert len(set(table["customer_id"])) > row_count * 0.95
@pytest.mark.asyncio
async def test_multiple_table_generation(data_generator):
    """Test generation of multiple related tables."""
    customers_schema = {
        "customer_id": {"type": "int", "min": 1, "max": 1000},
        "name": {"type": "string"},
    }
    policies_schema = {
        "policy_id": {"type": "int", "min": 1, "max": 2000},
        "customer_id": {"type": "int", "min": 1, "max": 1000, "correlated": True},
        "premium": {"type": "float", "min": 500.0, "max": 5000.0},
    }
    # Generate the parent table, then the child table.
    customers = await data_generator.generate_synthetic_data("customers", customers_schema, 100)
    policies = await data_generator.generate_synthetic_data("policies", policies_schema, 200)
    # Referential integrity: every policy's customer_id exists in customers.
    assert set(policies["customer_id"]) <= set(customers["customer_id"])

1115
src/datagen/uv.lock generated Normal file

File diff suppressed because it is too large Load Diff