mirror of
https://github.com/modelcontextprotocol/servers.git
synced 2026-04-26 15:55:39 +02:00
fix: Add faker type mapping and fix string generation
Co-Authored-By: alexander@anthropic.com <alexander@anthropic.com>
This commit is contained in:
@@ -155,10 +155,15 @@ class DataGenServer:
|
||||
This tool creates multiple tables of synthetic data based on provided schemas. It supports:
|
||||
- Basic data types (integer, float, string, boolean)
|
||||
- Categorical data with custom categories
|
||||
- Realistic personal data (names, emails, addresses) via Faker
|
||||
- Realistic personal data via Faker (names, emails, addresses)
|
||||
- Numeric data with configurable ranges via NumPy
|
||||
- Related tables with correlated IDs
|
||||
|
||||
When using the 'faker' generator, you must specify one of the supported faker types in the 'type' field:
|
||||
- Personal: first_name, last_name, email, phone_number, address
|
||||
- Dates: date_of_birth, date_this_year, date_this_decade
|
||||
- Text: text (default for generic strings)
|
||||
|
||||
Default schemas are available for common scenarios (customers, policies, claims).""",
|
||||
inputSchema={
|
||||
"type": "object",
|
||||
@@ -189,14 +194,17 @@ class DataGenServer:
|
||||
"description": """Data type for the column. Valid options:
|
||||
- Basic: 'string', 'integer'/'int', 'float', 'boolean'
|
||||
- Categorical: 'category'
|
||||
- Faker types: 'first_name', 'last_name', 'email', 'phone_number',
|
||||
'address', 'date_of_birth', 'text', 'date_this_year', 'date_this_decade'"""
|
||||
- Faker types (use with 'faker' generator): 'first_name', 'last_name', 'email',
|
||||
'phone_number', 'address', 'date_of_birth', 'text', 'date_this_year',
|
||||
'date_this_decade'. Generic 'string' type defaults to 'text'."""
|
||||
},
|
||||
"generator": {
|
||||
"type": "string",
|
||||
"description": """Library to use for generating values. Valid options:
|
||||
- 'numpy': For numeric and categorical data
|
||||
- 'faker': For realistic personal/business data
|
||||
- 'faker': For realistic personal/business data. Must be used with a supported
|
||||
faker type (see type field). Will generate null values if used with
|
||||
unsupported types.
|
||||
- 'mimesis': Alternative to Faker for personal data"""
|
||||
},
|
||||
"min": {
|
||||
|
||||
@@ -60,12 +60,25 @@ class SyntheticDataGenerator:
|
||||
}
|
||||
return type_mapping.get(data_type, "categorical")
|
||||
|
||||
def _generate_faker_value(self, generator: str) -> Any:
|
||||
"""Generate value using Faker."""
|
||||
if not generator.startswith("faker."):
|
||||
return None
|
||||
def _map_faker_type(self, data_type: str) -> str:
|
||||
"""Map data type to Faker method."""
|
||||
type_mapping = {
|
||||
"string": "text", # Default for generic strings
|
||||
"first_name": "first_name",
|
||||
"last_name": "last_name",
|
||||
"email": "email",
|
||||
"phone_number": "phone_number",
|
||||
"address": "street_address",
|
||||
"text": "text",
|
||||
"date_this_year": "date_this_year",
|
||||
"date_this_decade": "date_this_decade",
|
||||
"date_of_birth": "date_of_birth"
|
||||
}
|
||||
return type_mapping.get(data_type, "text")
|
||||
|
||||
method_name = generator.split(".", 1)[1]
|
||||
def _generate_faker_value(self, data_type: str) -> Any:
|
||||
"""Generate value using Faker."""
|
||||
method_name = self._map_faker_type(data_type)
|
||||
if hasattr(self.faker, method_name):
|
||||
return getattr(self.faker, method_name)()
|
||||
return None
|
||||
@@ -211,7 +224,7 @@ class SyntheticDataGenerator:
|
||||
generator_str = col_spec["generator"]
|
||||
if generator_str.startswith("faker."):
|
||||
sample_data[col_name] = [
|
||||
self._generate_faker_value(generator_str)
|
||||
self._generate_faker_value(generator_str.split(".", 1)[1])
|
||||
for _ in range(fitting_size)
|
||||
]
|
||||
elif generator_str.startswith("mimesis."):
|
||||
@@ -233,9 +246,14 @@ class SyntheticDataGenerator:
|
||||
elif col_type == "string":
|
||||
if "generator" in col_spec:
|
||||
generator_str = col_spec["generator"]
|
||||
if generator_str.startswith("faker."):
|
||||
if generator_str == "faker":
|
||||
sample_data[col_name] = [
|
||||
self._generate_faker_value(generator_str)
|
||||
self._generate_faker_value(col_spec.get("type", "string"))
|
||||
for _ in range(fitting_size)
|
||||
]
|
||||
elif generator_str.startswith("faker."):
|
||||
sample_data[col_name] = [
|
||||
self._generate_faker_value(generator_str.split(".", 1)[1])
|
||||
for _ in range(fitting_size)
|
||||
]
|
||||
elif generator_str.startswith("mimesis."):
|
||||
@@ -295,16 +313,22 @@ class SyntheticDataGenerator:
|
||||
# Generate correlated ID from parent table
|
||||
parent_table = self._extract_parent_table(col_name)
|
||||
value = self._generate_correlated_id(parent_table)
|
||||
# Handle faker types and string types with faker generator
|
||||
elif col_spec.get("generator") == "faker":
|
||||
if col_type in {"first_name", "last_name", "email", "phone_number", "address", "text", "date_this_year", "date_this_decade", "date_of_birth"}:
|
||||
value = self._generate_faker_value(col_type)
|
||||
else:
|
||||
value = self._generate_faker_value("text")
|
||||
elif col_type == "string":
|
||||
if "generator" in col_spec:
|
||||
if col_spec["generator"].startswith("faker."):
|
||||
value = self._generate_faker_value(col_spec["generator"])
|
||||
value = self._generate_faker_value(col_spec["generator"].split(".", 1)[1])
|
||||
elif col_spec["generator"].startswith("mimesis."):
|
||||
value = self._generate_mimesis_value(col_spec["generator"])
|
||||
elif "categories" in col_spec:
|
||||
value = np.random.choice(col_spec["categories"])
|
||||
else:
|
||||
value = self._generate_faker_value("faker.word")
|
||||
value = self._generate_faker_value("text")
|
||||
elif col_type in ("int", "integer"):
|
||||
min_val = col_spec.get("min", 0)
|
||||
max_val = col_spec.get("max", 100)
|
||||
@@ -315,7 +339,7 @@ class SyntheticDataGenerator:
|
||||
if "generator" in col_spec:
|
||||
value = self._generate_faker_value(col_spec["generator"])
|
||||
else:
|
||||
value = self._generate_faker_value("faker.date_time_this_decade")
|
||||
value = self._generate_faker_value("date_time_this_decade")
|
||||
elif col_type == "category":
|
||||
value = np.random.choice(col_spec["categories"])
|
||||
|
||||
|
||||
@@ -146,3 +146,58 @@ async def test_integer_type_handling(data_generator):
|
||||
assert all(isinstance(x, (int, np.integer)) for x in data["count"]), "Count values should be integers"
|
||||
assert all(1 <= x <= 10 for x in data["count"]), "Count values should be within range"
|
||||
assert not any(x is None for x in data["count"]), "Count values should not be null"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_faker_string_generation(data_generator):
    """Check that Faker-backed columns yield non-empty string values.

    Three schema shapes are exercised in turn: a generic ``string`` column
    using the ``faker`` generator, columns whose ``type`` names a specific
    Faker type, and the legacy ``faker.<method>`` generator spelling.
    """
    # Generic string column driven by the faker generator.
    generic_schema = {"customer_name": {"type": "string", "generator": "faker"}}
    generic_rows = await data_generator.generate_synthetic_data("test", generic_schema, 10)
    customer_names = generic_rows["customer_name"]
    # isinstance(value, str) already rules out None.
    assert all(isinstance(value, str) for value in customer_names), "Customer names should be non-null strings"
    assert all(len(value) > 0 for value in generic_rows["customer_name"]), "Customer names should not be empty"

    # Columns whose type is itself a specific faker type.
    typed_schema = {
        column: {"type": column, "generator": "faker"}
        for column in ("first_name", "email", "address")
    }
    typed_rows = await data_generator.generate_synthetic_data("test", typed_schema, 10)

    # first_name column
    first_names = typed_rows["first_name"]
    assert all(isinstance(value, str) for value in first_names), "First names should be non-null strings"
    assert all(len(value) > 0 for value in typed_rows["first_name"]), "First names should not be empty"

    # email column
    emails = typed_rows["email"]
    assert all(isinstance(value, str) for value in emails), "Emails should be non-null strings"
    assert all("@" in value for value in typed_rows["email"]), "Emails should contain @ symbol"

    # address column
    addresses = typed_rows["address"]
    assert all(isinstance(value, str) for value in addresses), "Addresses should be non-null strings"
    assert all(len(value) > 0 for value in typed_rows["address"]), "Addresses should not be empty"

    # Legacy "faker.<method>" generator strings must keep working.
    legacy_schema = {"legacy_name": {"type": "string", "generator": "faker.name"}}
    legacy_rows = await data_generator.generate_synthetic_data("test", legacy_schema, 10)
    legacy_names = legacy_rows["legacy_name"]
    assert all(isinstance(value, str) for value in legacy_names), "Legacy faker format should still work"
    assert all(len(value) > 0 for value in legacy_rows["legacy_name"]), "Legacy faker names should not be empty"
|
||||
|
||||
Reference in New Issue
Block a user