diff --git a/src/datagen/README.md b/src/datagen/README.md index 6df02268..fe389ef3 100644 --- a/src/datagen/README.md +++ b/src/datagen/README.md @@ -1,81 +1,443 @@ # MCP Data Generation Server -## Overview -A Model Context Protocol server for generating synthetic data across various domains. Leverages open-source libraries like Faker, Mimesis, NumPy, and SDV to create realistic, customizable datasets. - -## Tools - -### generate_data -Generate synthetic data based on specified schemas and parameters. - -**Input:** -- `rows` (integer, required): Number of rows to generate -- `tables` (array[string], required): List of table names to generate -- `schemas` (object, required): Schema definitions for each table - - Table schema format: - ```json - { - "column_name": { - "type": "string|integer|float|category|boolean", - "min": number, // Optional, for numeric types - "max": number, // Optional, for numeric types - "categories": ["value1", "value2"] // Required for category type - } - } - ``` - -**Returns:** -- Generated data tables matching the specified schemas +A Model Context Protocol (MCP) server for generating synthetic data with support for multiple data generation libraries including faker, mimesis, and numpy. ## Installation -Using `uv` (recommended): ```bash +# Using uv (recommended) uvx mcp-server-datagen -``` -Using `pip`: -```bash +# Using pip pip install mcp-server-datagen ``` -## Configuration -No additional configuration required. The server automatically configures data generation libraries based on schema specifications. +## Tools + +### generate_custom_tables + +Generate synthetic data based on custom schemas and parameters. 
+ +**Input Parameters:** +- `tables`: List of table names to generate +- `rows`: Number of rows to generate (default: 1000) +- `schemas`: Dictionary of table schemas defining columns and their properties + +**Example:** +```json +{ + "tables": ["customers", "policies"], + "rows": 1000, + "schemas": { + "customers": { + "customer_id": { + "type": "integer", + "generator": "numpy", + "min": 1000, + "max": 9999, + "prefix": "CUST-" + }, + "name": { + "type": "string", + "generator": "faker", + "method": "name" + }, + "risk_score": { + "type": "float", + "generator": "numpy", + "min": 0.0, + "max": 1.0 + }, + "age": { + "type": "integer", + "generator": "numpy", + "min": 18, + "max": 90 + }, + "email": { + "type": "string", + "generator": "faker", + "method": "email" + }, + "is_active": { + "type": "boolean", + "generator": "numpy" + } + }, + "policies": { + "policy_id": { + "type": "integer", + "generator": "numpy", + "min": 2000, + "max": 9999, + "prefix": "POL-" + }, + "customer_id": { + "type": "integer", + "correlated": true + }, + "premium": { + "type": "float", + "generator": "numpy", + "min": 500.0, + "max": 5000.0 + }, + "coverage_type": { + "type": "category", + "generator": "numpy", + "categories": ["basic", "standard", "premium"] + }, + "start_date": { + "type": "date", + "generator": "faker", + "method": "date_this_year" + } + } + } +} +``` + +### generate_insurance_data + +Generate synthetic insurance data using predefined schemas for customers, policies, and claims. + +**Input Parameters:** +- `rows`: Number of rows to generate for each table (default: 1000) + +**Example:** +```json +{ + "rows": 1000 +} +``` + +This will generate three related tables: +1. `customers`: Customer information including demographics and contact details +2. `policies`: Insurance policy details with references to customers +3. 
`claims`: Claim records with references to policies + +**Example Output:** +```json +{ + "tables": { + "customers": { + "customer_id": ["CUST-1001", "CUST-1002"], + "name": ["John Doe", "Jane Smith"], + "age": [45, 32], + "email": ["john.doe@example.com", "jane.smith@example.com"], + "risk_score": [0.75, 0.45], + "is_active": [true, false] + }, + "policies": { + "policy_id": ["POL-2001", "POL-2002"], + "customer_id": ["CUST-1001", "CUST-1002"], + "premium": [1250.50, 980.75], + "coverage_type": ["comprehensive", "basic"], + "start_date": ["2024-01-15", "2024-02-01"] + }, + "claims": { + "claim_id": ["CLM-3001"], + "policy_id": ["POL-2001"], + "amount": [5000.00], + "status": ["pending"], + "date_filed": ["2024-03-10"] + } + } +} +``` + +## Generator Types + +### numpy +- Supports numeric types (integer, float) and boolean values +- Requires min/max parameters for numeric ranges +- Used for generating random numerical data and categorical values +- Automatically handles type conversion and JSON serialization + +**Examples:** +```json +{ + "age": { + "type": "integer", + "generator": "numpy", + "min": 18, + "max": 90 + }, + "risk_score": { + "type": "float", + "generator": "numpy", + "min": 0.0, + "max": 1.0 + }, + "is_active": { + "type": "boolean", + "generator": "numpy" + }, + "status": { + "type": "category", + "generator": "numpy", + "categories": ["active", "pending", "cancelled"] + } +} +``` + +### faker +- Generates realistic personal and business data +- Requires specific method parameter for data type +- Supports various data types including names, emails, dates, and addresses +- Methods are mapped directly to Faker library functions + +**Examples:** +```json +{ + "name": { + "type": "string", + "generator": "faker", + "method": "name" + }, + "email": { + "type": "string", + "generator": "faker", + "method": "email" + }, + "address": { + "type": "string", + "generator": "faker", + "method": "address" + }, + "date_joined": { + "type": "date", + "generator": 
"faker", + "method": "date_this_year" + } +} +``` + +### mimesis +- Alternative to faker for generating personal data +- Supports hierarchical method paths (e.g., "person.full_name") +- Provides consistent data across different locales + +**Examples:** +```json +{ + "full_name": { + "type": "string", + "generator": "mimesis", + "method": "person.full_name" + }, + "occupation": { + "type": "string", + "generator": "mimesis", + "method": "person.occupation" + }, + "email": { + "type": "string", + "generator": "mimesis", + "method": "person.email" + } +} +``` + +## Supported Data Types + +### string +- Used for text data +- Supports faker and mimesis generators +- Requires method specification +- Optional prefix support for IDs + +### integer +- Whole number values +- Requires min/max range for numpy generator +- Can be used with prefix for ID generation +- Supports correlation between tables + +### float +- Decimal number values +- Requires min/max range for numpy generator +- Automatically handles precision + +### boolean +- True/False values +- Uses numpy generator +- No additional parameters required + +### category +- Enumerated values from a predefined list +- Requires categories parameter +- Uses numpy generator for random selection + +### date +- Date values +- Uses faker generator +- Requires specific method (e.g., "date_this_year") + +## Special Features + +### ID Generation +- Unique ID generation with customizable ranges +- Optional prefix support for readable identifiers +- Automatic correlation between related tables +- Built-in duplicate prevention + +**Examples:** +```json +{ + "customer_id": { + "type": "integer", + "generator": "numpy", + "min": 1000, + "max": 9999, + "prefix": "CUST-" + }, + "policy_id": { + "type": "integer", + "generator": "numpy", + "min": 2000, + "max": 9999, + "prefix": "POL-" + } +} +``` + +### Table Relationships +- Automatic generation order: customers → policies → claims +- Maintains referential integrity across tables +- 
Correlated IDs ensure valid relationships +- Supports complex relationship chains + +**Example with Relationships:** +```json +{ + "tables": ["customers", "policies", "claims"], + "rows": 1000, + "schemas": { + "customers": { + "customer_id": { + "type": "integer", + "generator": "numpy", + "min": 1000, + "max": 9999, + "prefix": "CUST-" + } + }, + "policies": { + "policy_id": { + "type": "integer", + "generator": "numpy", + "min": 2000, + "max": 9999, + "prefix": "POL-" + }, + "customer_id": { + "type": "integer", + "correlated": true + } + }, + "claims": { + "claim_id": { + "type": "integer", + "generator": "numpy", + "min": 3000, + "max": 9999, + "prefix": "CLM-" + }, + "policy_id": { + "type": "integer", + "correlated": true + } + } + } +} +``` + +### Data Generation Order +1. Parent tables are generated first (e.g., customers) +2. Child tables with correlations follow (e.g., policies referencing customers) +3. Grandchild tables are generated last (e.g., claims referencing policies) + +This order ensures that: +- All referenced IDs exist when needed +- Relationships are valid and consistent +- Data integrity is maintained across the dataset + +### Correlation Rules +- Use `"correlated": true` to reference parent table IDs +- Parent table must be generated before child table +- Column names must follow pattern: `{table_name}_id` +- Automatically handles ID type matching and prefixes ## Development -1. Create virtual environment and install dependencies: +### Setup + +1. Clone the repository +2. Install dependencies: ```bash -uv venv -uv pip install -e ".[dev]" +uv sync --frozen --all-extras --dev ``` -2. Run tests: +3. Run tests: ```bash -uv run pytest tests/unit/ +uv run pytest tests/ ``` -3. Run type checking: +### Type Checking + ```bash -uv run --frozen pyright +uv run pyright ``` -4. Run linting: +### Linting + ```bash uv run ruff check . +uv run ruff format . +```
+ +## Troubleshooting + +### Common Issues + +#### Null Values in Generated Data +- **Check Generator Type Compatibility**: Ensure the generator type matches the data type (e.g., use 'faker' for personal data, 'numpy' for numeric) +- **Verify Faker Method Support**: When using faker, confirm the method is supported (see faker types in documentation) +- **Validate Numeric Ranges**: For numeric types, ensure min/max values are valid and min is less than max +- **Boolean Type Generation**: Use 'numpy' generator for boolean types to avoid null values +- **Date Format Specification**: For date fields, use specific faker methods like 'date_this_year' instead of generic 'date' + +#### ID Generation Problems +- **Prefix Format**: + - Ensure prefix is a string (e.g., "POL-", "CUST-") + - Avoid special characters that might cause parsing issues + - Keep prefixes consistent within related tables +- **ID Range Configuration**: + - Set appropriate min/max ranges to avoid collisions + - Ensure range is large enough for requested row count + - Example: For 1000 rows, use range of at least 2000 (min: 1000, max: 3000) +- **Correlated ID Issues**: + - Verify parent table exists and is generated first + - Check parent table has sufficient unique IDs + - Ensure parent table schema includes proper ID field + - Example: For policy.customer_id, customers table must exist with customer_id field + +#### Table Relationship Errors +- **Generation Order**: Tables must be generated in correct order (parent tables first) +- **Schema Consistency**: ID field names must match between parent and child tables +- **Unique Constraints**: Ensure ID ranges don't overlap between tables +- **Correlation Settings**: Set `"correlated": true` for foreign key fields + +#### Type Conversion Errors +- **JSON Serialization**: Some numpy types may need explicit conversion +- **Date Format Issues**: Use ISO format for dates (YYYY-MM-DD) +- **String Conversion**: Ensure prefix concatenation results in valid strings 
+- **Numeric Precision**: Float values may need rounding for specific use cases + +### Best Practices +1. Start with small row counts to validate schema configuration +2. Use descriptive prefixes for better data readability +3. Implement proper error handling for generated data +4. Validate schema before generating large datasets +5. Monitor memory usage with large row counts +6. Use appropriate generator types for each data category -``` - -## Debugging - -Common issues: -1. Null values in generated data - - Ensure correct type specification in schema - - Verify min/max values are within valid ranges - - Check category lists are non-empty for categorical fields - -2. Type validation errors - - Verify schema types match supported types - - Ensure numeric ranges are appropriate for the type - -## License -MIT License diff --git a/src/datagen/src/mcp_server_datagen/server.py b/src/datagen/src/mcp_server_datagen/server.py index 91298bae..1bc2e3c9 100644 --- a/src/datagen/src/mcp_server_datagen/server.py +++ b/src/datagen/src/mcp_server_datagen/server.py @@ -160,6 +160,62 @@ class DataGenServer: - Numeric data with configurable ranges via NumPy - Related tables with correlated IDs + Generator Types and Examples: + 1. NumPy Generator: + ```json + "age": { + "type": "integer", + "generator": "numpy", + "min": 18, + "max": 65 + } + ``` + + 2. Faker Generator: + ```json + "name": { + "type": "first_name", + "generator": "faker" + } + ``` + + 3. Mimesis Generator: + ```json + "email": { + "type": "email", + "generator": "mimesis" + } + ``` + + Special Features: + 1. ID Prefixes: + ```json + "policy_id": { + "type": "integer", + "generator": "numpy", + "min": 1000, + "max": 9999, + "prefix": "POL-" + } + ``` + + 2. Correlated IDs (Foreign Keys): + ```json + "customer_id": { + "type": "integer", + "correlated": true + } + ``` + + 3. 
Categories with Prefixes: + ```json + "status_code": { + "type": "string", + "prefix": "STATUS-", + "categories": ["ACTIVE", "PENDING", "CLOSED"] + } + ``` + When using the 'faker' generator, you must specify one of the supported faker types in the 'type' field: - Personal: first_name, last_name, email, phone_number, address - Dates: date_of_birth, date_this_year, date_this_decade @@ -238,16 +294,32 @@ class DataGenServer: description="""Generate insurance-related data tables using default schemas. Generates three tables with realistic insurance data: - - customers: Customer information (names, contact details, etc.) - - policies: Insurance policy details with proper ID prefixes - - claims: Claims data with relationships to policies + 1. Customers Table: + - IDs: Unique integer IDs (10000-99999) + - Personal: Names, email, phone, address (using faker) + - Numeric: Age (18-100), credit score (300-850) + - Status: Boolean active flag - All tables maintain referential integrity and include: - - Customers: IDs, names, contact info, credit scores - - Policies: IDs with prefixes, types, dates, premiums, coverage - - Claims: IDs, dates, types, amounts, status updates + 2. Policies Table: + - IDs: Prefixed IDs (e.g., "POL-2024-123456") + - References: Correlated customer_ids + - Categories: Auto, Home, Life, Health + - Dates: Start dates within current year + - Numeric: Premium ($500-$5000), deductible ($250-$2000) + - Coverage: $50K-$1M range + - Status: Active, Pending, Expired, Cancelled - The data is generated using predefined schemas optimized for insurance scenarios.""", + 3. 
Claims Table: + - IDs: 6-digit claim numbers + - References: Correlated policy_ids (with prefix) + - Dates: Filing dates within current year + - Amounts: $100-$50,000 range + - Status: Filed, Under Review, Approved, Denied + - Details: Generated claim descriptions + + All tables maintain referential integrity and use appropriate + generators (numpy for numeric, faker for personal data) with + realistic value ranges and categories.""", inputSchema={ "type": "object", "properties": { diff --git a/src/datagen/src/mcp_server_datagen/synthetic.py b/src/datagen/src/mcp_server_datagen/synthetic.py index ea5eac01..ed074102 100644 --- a/src/datagen/src/mcp_server_datagen/synthetic.py +++ b/src/datagen/src/mcp_server_datagen/synthetic.py @@ -1,4 +1,9 @@ -"""Synthetic data generation using numpy and faker.""" +"""Synthetic data generation using numpy, faker, and mimesis. + +This module provides functionality for generating synthetic data using multiple +generator libraries including numpy, faker, and mimesis. It supports various +data types, ID generation with prefixes, and table relationships. +""" from datetime import datetime, date import numpy as np @@ -39,10 +44,32 @@ DEFAULT_CLAIMS_SCHEMA = { class SyntheticDataGenerator: - """Handles synthetic data generation.""" + """Handles synthetic data generation with support for multiple generators. 
+ + Features: + - Multiple generator types (numpy, faker, mimesis) + - Automatic ID generation and correlation + - Prefix support for IDs + - JSON serialization handling + + Generator Types: + - numpy: For numeric and categorical data + - faker: For realistic personal/business data + - mimesis: Alternative to faker + + Special Features: + - prefix: Add prefixes to generated IDs + - correlated: Generate IDs that reference other tables + - type mapping: Automatic conversion between generator types + - JSON serialization: Handles numpy type conversion + """ def __init__(self): - """Initialize the synthetic data generator.""" + """Initialize the synthetic data generator. + + Sets up faker and mimesis instances and initializes tracking for + generated IDs and table relationships. + """ self.faker = Faker() self.mimesis = Generic() # Store generated IDs for relationships @@ -56,7 +83,21 @@ class SyntheticDataGenerator: } def _ensure_json_serializable(self, value: Any) -> Any: - """Convert value to JSON serializable type.""" + """Convert value to JSON serializable type. + + Args: + value: Any value that needs to be JSON serializable + + Returns: + The value converted to a JSON serializable type + + Handles: + - numpy integer types + - numpy float types + - numpy boolean types + - datetime objects + - string conversion + """ if isinstance(value, (np.integer, np.floating)): return value.item() elif isinstance(value, np.bool_): @@ -68,7 +109,37 @@ class SyntheticDataGenerator: return value def _map_type_to_generator(self, col_name: str, spec: Dict[str, Any]) -> Callable[[], Any]: - """Map a column specification to a generator function.""" + """Map a column specification to a generator function. 
+ + Args: + col_name: Name of the column being generated + spec: Column specification dictionary containing type and generator info + + Returns: + A callable that generates values according to the specification + + Generator Types: + - numpy: Numeric and categorical data with min/max ranges + - faker: Realistic personal and business data + - mimesis: Alternative personal data generation + + Special Features: + - Prefix support for string and ID fields + - Correlated ID generation for relationships + - Category-based generation for enums + - Type-specific value ranges + + Example: + >>> spec = { + ... "type": "integer", + ... "generator": "numpy", + ... "min": 1, + ... "max": 100, + ... "prefix": "ID-" + ... } + >>> generator = self._map_type_to_generator("id", spec) + >>> value = generator() # Returns "ID-42" (example) + """ data_type = spec["type"] generator = spec.get("generator", None) @@ -139,7 +210,22 @@ class SyntheticDataGenerator: raise ValueError(f"Unsupported data type: {data_type}") def _map_faker_type(self, data_type: str) -> Any: - """Map a data type to a faker method.""" + """Map a data type to a faker method. + + Args: + data_type: The type of data to generate (e.g., "string", "email") + + Returns: + A callable faker method for generating the specified data type + + Features: + - Supports direct faker method calls (e.g., "faker.name") + - Maps common data types to faker methods + - Fallback to text generation for unsupported types + + Raises: + ValueError: If the specified faker type is unsupported + """ # Handle faker.method format if data_type.startswith("faker."): method = data_type.split(".", 1)[1] @@ -174,7 +260,22 @@ class SyntheticDataGenerator: def _generate_mimesis_value(self, generator: str) -> Any: - """Generate a value using mimesis.""" + """Generate a value using mimesis. 
+ + Args: + generator: String specifying the mimesis generator method + Format: "mimesis.provider.method" or "method" + + Returns: + Generated value from mimesis + + Raises: + ValueError: If the specified generator method is invalid + + Example: + >>> value = self._generate_mimesis_value("mimesis.person.full_name") + >>> print(value) # "John Doe" + """ if "." in generator: methods = generator.split(".") obj = self.mimesis @@ -188,7 +289,30 @@ class SyntheticDataGenerator: return getattr(self.mimesis, generator)() def _generate_unique_id(self, table_name: str, spec: Dict[str, Any]) -> Union[int, str]: - """Generate a unique ID for a table.""" + """Generate a unique ID for a table. + + Args: + table_name: Name of the table requiring the ID + spec: Specification for ID generation including: + - type: "integer" or "string" + - min: Minimum value for range + - max: Maximum value for range + - prefix: Optional prefix for generated IDs + + Returns: + A unique ID (integer or string with prefix) + + Features: + - Ensures uniqueness within table scope + - Supports integer and string ID formats + - Optional prefix support (e.g., "CUST-", "POL-") + - Configurable value ranges + - Built-in duplicate prevention + + Raises: + ValueError: If unable to generate unique ID after max attempts + or if ID type is unsupported + """ id_type = spec.get("type", "integer") min_val = spec.get("min", 1) max_val = spec.get("max", 1000000) @@ -218,7 +342,29 @@ class SyntheticDataGenerator: raise ValueError(f"Failed to generate unique ID for table {table_name} after {max_attempts} attempts") def _generate_correlated_id(self, parent_table: str) -> Union[int, str]: - """Generate a correlated ID from a parent table.""" + """Generate a correlated ID from a parent table. 
+ + Args: + parent_table: Name of the table containing the parent IDs + + Returns: + An ID from the parent table's generated IDs + + Features: + - Maintains referential integrity between tables + - Supports both integer and string ID formats + - Preserves ID format including prefixes + - Random selection from existing parent IDs + - Handles special cases (policies, customers, claims) + + Raises: + ValueError: If no IDs are available in parent table + or if parent table doesn't exist + + Example: + >>> generator._generate_correlated_id("customers") + 'CUST-1234' # Returns existing customer ID + """ if not self._generated_ids.get(parent_table): raise ValueError(f"No IDs available for parent table {parent_table}") @@ -229,7 +375,33 @@ class SyntheticDataGenerator: return np.random.choice(parent_ids) def _extract_parent_table(self, col_name: str) -> str: - """Extract parent table name from column name.""" + """Extract parent table name from column name. + + Args: + col_name: Name of the column (usually ending in '_id') + + Returns: + Name of the parent table (pluralized) + + Features: + - Special case handling: + - policy_id -> policies + - customer_id -> customers + - claim_id -> claims + - General case: + - Removes '_id' suffix + - Adds 's' for pluralization + - Smart pluralization rules + + Example: + >>> generator._extract_parent_table("customer_id") + 'customers' + >>> generator._extract_parent_table("policy_id") + 'policies' + + Raises: + ValueError: If column name format is invalid for extraction + """ # Handle special cases first if col_name == "policy_id": return "policies" @@ -247,7 +419,29 @@ class SyntheticDataGenerator: raise ValueError(f"Invalid column name for parent table extraction: {col_name}") def _ensure_parent_table_ids(self, parent_table: str, rows: int) -> None: - """Ensure parent table has generated IDs.""" + """Ensure parent table has generated IDs. 
+ + Args: + parent_table: Name of the parent table + rows: Number of rows needed for relationships + + Features: + - Automatic schema detection for parent tables + - Smart ID field name resolution: + - Handles singular_id format (customer_id) + - Handles table_id format (customers_id) + - Special cases for policies, customers, claims + - Generates required number of unique IDs + - Maintains ID format consistency + + Raises: + ValueError: If no schema found for parent table + or if no ID field found in schema + + Example: + >>> generator._ensure_parent_table_ids("customers", 1000) + # Generates 1000 unique customer IDs for relationships + """ if not self._generated_ids.get(parent_table): # Get the schema for the parent table if parent_table not in self.default_schemas: @@ -271,7 +465,15 @@ class SyntheticDataGenerator: self._generated_ids[parent_table] = {generator() for _ in range(rows)} # Use set comprehension def _clear_generated_ids(self): - """Clear all generated IDs.""" + """Clear all generated IDs. + + Resets the internal tracking of generated IDs across all tables. + This is useful when: + - Starting a new generation session + - Cleaning up after error conditions + - Resetting state for new table relationships + - Freeing memory after large generations + """ self._generated_ids = {} async def generate_synthetic_data( @@ -280,7 +482,31 @@ class SyntheticDataGenerator: schema: Dict[str, Dict[str, Any]], rows: int ) -> Dict[str, List[Any]]: - """Generate synthetic data for a table.""" + """Generate synthetic data for a table. 
+ + Args: + table_name: Name of the table to generate data for + schema: Schema definition for the table's columns + rows: Number of rows to generate + + Returns: + Dictionary mapping column names to lists of generated values + + Features: + - Two-pass generation for handling relationships + - Automatic ID generation for parent tables + - Correlated ID generation for child tables + - JSON serialization of all values + - Support for all generator types + + Example: + >>> schema = { + ... "id": {"type": "integer", "min": 1, "max": 100}, + ... "name": {"type": "string", "generator": "faker"}, + ... "parent_id": {"type": "integer", "correlated": True} + ... } + >>> data = await generator.generate_synthetic_data("table", schema, 10) + """ data: Dict[str, List[Any]] = {} # If this is a parent table, ensure we generate IDs first