Mirror of https://github.com/modelcontextprotocol/servers.git (synced 2026-04-27 00:05:18 +02:00)
fix: Add faker type mapping and fix string generation
Co-Authored-By: alexander@anthropic.com <alexander@anthropic.com>
This commit is contained in:
@@ -155,10 +155,15 @@ class DataGenServer:
|
|||||||
This tool creates multiple tables of synthetic data based on provided schemas. It supports:
|
This tool creates multiple tables of synthetic data based on provided schemas. It supports:
|
||||||
- Basic data types (integer, float, string, boolean)
|
- Basic data types (integer, float, string, boolean)
|
||||||
- Categorical data with custom categories
|
- Categorical data with custom categories
|
||||||
- Realistic personal data (names, emails, addresses) via Faker
|
- Realistic personal data via Faker (names, emails, addresses)
|
||||||
- Numeric data with configurable ranges via NumPy
|
- Numeric data with configurable ranges via NumPy
|
||||||
- Related tables with correlated IDs
|
- Related tables with correlated IDs
|
||||||
|
|
||||||
|
When using the 'faker' generator, you must specify one of the supported faker types in the 'type' field:
|
||||||
|
- Personal: first_name, last_name, email, phone_number, address
|
||||||
|
- Dates: date_of_birth, date_this_year, date_this_decade
|
||||||
|
- Text: text (default for generic strings)
|
||||||
|
|
||||||
Default schemas are available for common scenarios (customers, policies, claims).""",
|
Default schemas are available for common scenarios (customers, policies, claims).""",
|
||||||
inputSchema={
|
inputSchema={
|
||||||
"type": "object",
|
"type": "object",
|
||||||
@@ -189,14 +194,17 @@ class DataGenServer:
|
|||||||
"description": """Data type for the column. Valid options:
|
"description": """Data type for the column. Valid options:
|
||||||
- Basic: 'string', 'integer'/'int', 'float', 'boolean'
|
- Basic: 'string', 'integer'/'int', 'float', 'boolean'
|
||||||
- Categorical: 'category'
|
- Categorical: 'category'
|
||||||
- Faker types: 'first_name', 'last_name', 'email', 'phone_number',
|
- Faker types (use with 'faker' generator): 'first_name', 'last_name', 'email',
|
||||||
'address', 'date_of_birth', 'text', 'date_this_year', 'date_this_decade'"""
|
'phone_number', 'address', 'date_of_birth', 'text', 'date_this_year',
|
||||||
|
'date_this_decade'. Generic 'string' type defaults to 'text'."""
|
||||||
},
|
},
|
||||||
"generator": {
|
"generator": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": """Library to use for generating values. Valid options:
|
"description": """Library to use for generating values. Valid options:
|
||||||
- 'numpy': For numeric and categorical data
|
- 'numpy': For numeric and categorical data
|
||||||
- 'faker': For realistic personal/business data
|
- 'faker': For realistic personal/business data. Must be used with a supported
|
||||||
|
faker type (see type field). Will generate null values if used with
|
||||||
|
unsupported types.
|
||||||
- 'mimesis': Alternative to Faker for personal data"""
|
- 'mimesis': Alternative to Faker for personal data"""
|
||||||
},
|
},
|
||||||
"min": {
|
"min": {
|
||||||
|
|||||||
@@ -60,12 +60,25 @@ class SyntheticDataGenerator:
|
|||||||
}
|
}
|
||||||
return type_mapping.get(data_type, "categorical")
|
return type_mapping.get(data_type, "categorical")
|
||||||
|
|
||||||
def _generate_faker_value(self, generator: str) -> Any:
|
def _map_faker_type(self, data_type: str) -> str:
|
||||||
"""Generate value using Faker."""
|
"""Map data type to Faker method."""
|
||||||
if not generator.startswith("faker."):
|
type_mapping = {
|
||||||
return None
|
"string": "text", # Default for generic strings
|
||||||
|
"first_name": "first_name",
|
||||||
|
"last_name": "last_name",
|
||||||
|
"email": "email",
|
||||||
|
"phone_number": "phone_number",
|
||||||
|
"address": "street_address",
|
||||||
|
"text": "text",
|
||||||
|
"date_this_year": "date_this_year",
|
||||||
|
"date_this_decade": "date_this_decade",
|
||||||
|
"date_of_birth": "date_of_birth"
|
||||||
|
}
|
||||||
|
return type_mapping.get(data_type, "text")
|
||||||
|
|
||||||
method_name = generator.split(".", 1)[1]
|
def _generate_faker_value(self, data_type: str) -> Any:
|
||||||
|
"""Generate value using Faker."""
|
||||||
|
method_name = self._map_faker_type(data_type)
|
||||||
if hasattr(self.faker, method_name):
|
if hasattr(self.faker, method_name):
|
||||||
return getattr(self.faker, method_name)()
|
return getattr(self.faker, method_name)()
|
||||||
return None
|
return None
|
||||||
@@ -211,7 +224,7 @@ class SyntheticDataGenerator:
|
|||||||
generator_str = col_spec["generator"]
|
generator_str = col_spec["generator"]
|
||||||
if generator_str.startswith("faker."):
|
if generator_str.startswith("faker."):
|
||||||
sample_data[col_name] = [
|
sample_data[col_name] = [
|
||||||
self._generate_faker_value(generator_str)
|
self._generate_faker_value(generator_str.split(".", 1)[1])
|
||||||
for _ in range(fitting_size)
|
for _ in range(fitting_size)
|
||||||
]
|
]
|
||||||
elif generator_str.startswith("mimesis."):
|
elif generator_str.startswith("mimesis."):
|
||||||
@@ -233,9 +246,14 @@ class SyntheticDataGenerator:
|
|||||||
elif col_type == "string":
|
elif col_type == "string":
|
||||||
if "generator" in col_spec:
|
if "generator" in col_spec:
|
||||||
generator_str = col_spec["generator"]
|
generator_str = col_spec["generator"]
|
||||||
if generator_str.startswith("faker."):
|
if generator_str == "faker":
|
||||||
sample_data[col_name] = [
|
sample_data[col_name] = [
|
||||||
self._generate_faker_value(generator_str)
|
self._generate_faker_value(col_spec.get("type", "string"))
|
||||||
|
for _ in range(fitting_size)
|
||||||
|
]
|
||||||
|
elif generator_str.startswith("faker."):
|
||||||
|
sample_data[col_name] = [
|
||||||
|
self._generate_faker_value(generator_str.split(".", 1)[1])
|
||||||
for _ in range(fitting_size)
|
for _ in range(fitting_size)
|
||||||
]
|
]
|
||||||
elif generator_str.startswith("mimesis."):
|
elif generator_str.startswith("mimesis."):
|
||||||
@@ -295,16 +313,22 @@ class SyntheticDataGenerator:
|
|||||||
# Generate correlated ID from parent table
|
# Generate correlated ID from parent table
|
||||||
parent_table = self._extract_parent_table(col_name)
|
parent_table = self._extract_parent_table(col_name)
|
||||||
value = self._generate_correlated_id(parent_table)
|
value = self._generate_correlated_id(parent_table)
|
||||||
|
# Handle faker types and string types with faker generator
|
||||||
|
elif col_spec.get("generator") == "faker":
|
||||||
|
if col_type in {"first_name", "last_name", "email", "phone_number", "address", "text", "date_this_year", "date_this_decade", "date_of_birth"}:
|
||||||
|
value = self._generate_faker_value(col_type)
|
||||||
|
else:
|
||||||
|
value = self._generate_faker_value("text")
|
||||||
elif col_type == "string":
|
elif col_type == "string":
|
||||||
if "generator" in col_spec:
|
if "generator" in col_spec:
|
||||||
if col_spec["generator"].startswith("faker."):
|
if col_spec["generator"].startswith("faker."):
|
||||||
value = self._generate_faker_value(col_spec["generator"])
|
value = self._generate_faker_value(col_spec["generator"].split(".", 1)[1])
|
||||||
elif col_spec["generator"].startswith("mimesis."):
|
elif col_spec["generator"].startswith("mimesis."):
|
||||||
value = self._generate_mimesis_value(col_spec["generator"])
|
value = self._generate_mimesis_value(col_spec["generator"])
|
||||||
elif "categories" in col_spec:
|
elif "categories" in col_spec:
|
||||||
value = np.random.choice(col_spec["categories"])
|
value = np.random.choice(col_spec["categories"])
|
||||||
else:
|
else:
|
||||||
value = self._generate_faker_value("faker.word")
|
value = self._generate_faker_value("text")
|
||||||
elif col_type in ("int", "integer"):
|
elif col_type in ("int", "integer"):
|
||||||
min_val = col_spec.get("min", 0)
|
min_val = col_spec.get("min", 0)
|
||||||
max_val = col_spec.get("max", 100)
|
max_val = col_spec.get("max", 100)
|
||||||
@@ -315,7 +339,7 @@ class SyntheticDataGenerator:
|
|||||||
if "generator" in col_spec:
|
if "generator" in col_spec:
|
||||||
value = self._generate_faker_value(col_spec["generator"])
|
value = self._generate_faker_value(col_spec["generator"])
|
||||||
else:
|
else:
|
||||||
value = self._generate_faker_value("faker.date_time_this_decade")
|
value = self._generate_faker_value("date_time_this_decade")
|
||||||
elif col_type == "category":
|
elif col_type == "category":
|
||||||
value = np.random.choice(col_spec["categories"])
|
value = np.random.choice(col_spec["categories"])
|
||||||
|
|
||||||
|
|||||||
@@ -146,3 +146,58 @@ async def test_integer_type_handling(data_generator):
|
|||||||
assert all(isinstance(x, (int, np.integer)) for x in data["count"]), "Count values should be integers"
|
assert all(isinstance(x, (int, np.integer)) for x in data["count"]), "Count values should be integers"
|
||||||
assert all(1 <= x <= 10 for x in data["count"]), "Count values should be within range"
|
assert all(1 <= x <= 10 for x in data["count"]), "Count values should be within range"
|
||||||
assert not any(x is None for x in data["count"]), "Count values should not be null"
|
assert not any(x is None for x in data["count"]), "Count values should not be null"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_faker_string_generation(data_generator):
|
||||||
|
"""Test that faker properly generates string values."""
|
||||||
|
# Test generic string type with faker generator
|
||||||
|
schema = {
|
||||||
|
"customer_name": {
|
||||||
|
"type": "string",
|
||||||
|
"generator": "faker"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
data = await data_generator.generate_synthetic_data("test", schema, 10)
|
||||||
|
assert all(isinstance(x, str) and x is not None for x in data["customer_name"]), "Customer names should be non-null strings"
|
||||||
|
assert all(len(x) > 0 for x in data["customer_name"]), "Customer names should not be empty"
|
||||||
|
|
||||||
|
# Test specific faker types
|
||||||
|
schema = {
|
||||||
|
"first_name": {
|
||||||
|
"type": "first_name",
|
||||||
|
"generator": "faker"
|
||||||
|
},
|
||||||
|
"email": {
|
||||||
|
"type": "email",
|
||||||
|
"generator": "faker"
|
||||||
|
},
|
||||||
|
"address": {
|
||||||
|
"type": "address",
|
||||||
|
"generator": "faker"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
data = await data_generator.generate_synthetic_data("test", schema, 10)
|
||||||
|
|
||||||
|
# Verify first_name generation
|
||||||
|
assert all(isinstance(x, str) and x is not None for x in data["first_name"]), "First names should be non-null strings"
|
||||||
|
assert all(len(x) > 0 for x in data["first_name"]), "First names should not be empty"
|
||||||
|
|
||||||
|
# Verify email generation
|
||||||
|
assert all(isinstance(x, str) and x is not None for x in data["email"]), "Emails should be non-null strings"
|
||||||
|
assert all("@" in x for x in data["email"]), "Emails should contain @ symbol"
|
||||||
|
|
||||||
|
# Verify address generation
|
||||||
|
assert all(isinstance(x, str) and x is not None for x in data["address"]), "Addresses should be non-null strings"
|
||||||
|
assert all(len(x) > 0 for x in data["address"]), "Addresses should not be empty"
|
||||||
|
|
||||||
|
# Test legacy faker.method format still works
|
||||||
|
schema = {
|
||||||
|
"legacy_name": {
|
||||||
|
"type": "string",
|
||||||
|
"generator": "faker.name"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
data = await data_generator.generate_synthetic_data("test", schema, 10)
|
||||||
|
assert all(isinstance(x, str) and x is not None for x in data["legacy_name"]), "Legacy faker format should still work"
|
||||||
|
assert all(len(x) > 0 for x in data["legacy_name"]), "Legacy faker names should not be empty"
|
||||||
|
|||||||
Reference in New Issue
Block a user