fix: Add faker type mapping and fix string generation

Co-Authored-By: alexander@anthropic.com <alexander@anthropic.com>
This commit is contained in:
Devin AI
2024-12-12 03:06:16 +00:00
parent 87354a84f2
commit e8ca4c0c3b
3 changed files with 102 additions and 15 deletions

View File

@@ -155,10 +155,15 @@ class DataGenServer:
This tool creates multiple tables of synthetic data based on provided schemas. It supports:
- Basic data types (integer, float, string, boolean)
- Categorical data with custom categories
- Realistic personal data (names, emails, addresses) via Faker
- Realistic personal data via Faker (names, emails, addresses)
- Numeric data with configurable ranges via NumPy
- Related tables with correlated IDs
When using the 'faker' generator, you must specify one of the supported faker types in the 'type' field:
- Personal: first_name, last_name, email, phone_number, address
- Dates: date_of_birth, date_this_year, date_this_decade
- Text: text (default for generic strings)
Default schemas are available for common scenarios (customers, policies, claims).""",
inputSchema={
"type": "object",
@@ -189,14 +194,17 @@ class DataGenServer:
"description": """Data type for the column. Valid options:
- Basic: 'string', 'integer'/'int', 'float', 'boolean'
- Categorical: 'category'
- Faker types: 'first_name', 'last_name', 'email', 'phone_number',
'address', 'date_of_birth', 'text', 'date_this_year', 'date_this_decade'"""
- Faker types (use with 'faker' generator): 'first_name', 'last_name', 'email',
'phone_number', 'address', 'date_of_birth', 'text', 'date_this_year',
'date_this_decade'. Generic 'string' type defaults to 'text'."""
},
"generator": {
"type": "string",
"description": """Library to use for generating values. Valid options:
- 'numpy': For numeric and categorical data
- 'faker': For realistic personal/business data
- 'faker': For realistic personal/business data. Must be used with a supported
faker type (see type field). Will generate null values if used with
unsupported types.
- 'mimesis': Alternative to Faker for personal data"""
},
"min": {

View File

@@ -60,12 +60,25 @@ class SyntheticDataGenerator:
}
return type_mapping.get(data_type, "categorical")
def _generate_faker_value(self, generator: str) -> Any:
"""Generate value using Faker."""
if not generator.startswith("faker."):
return None
def _map_faker_type(self, data_type: str) -> str:
"""Map data type to Faker method."""
type_mapping = {
"string": "text", # Default for generic strings
"first_name": "first_name",
"last_name": "last_name",
"email": "email",
"phone_number": "phone_number",
"address": "street_address",
"text": "text",
"date_this_year": "date_this_year",
"date_this_decade": "date_this_decade",
"date_of_birth": "date_of_birth"
}
return type_mapping.get(data_type, "text")
method_name = generator.split(".", 1)[1]
def _generate_faker_value(self, data_type: str) -> Any:
"""Generate value using Faker."""
method_name = self._map_faker_type(data_type)
if hasattr(self.faker, method_name):
return getattr(self.faker, method_name)()
return None
@@ -211,7 +224,7 @@ class SyntheticDataGenerator:
generator_str = col_spec["generator"]
if generator_str.startswith("faker."):
sample_data[col_name] = [
self._generate_faker_value(generator_str)
self._generate_faker_value(generator_str.split(".", 1)[1])
for _ in range(fitting_size)
]
elif generator_str.startswith("mimesis."):
@@ -233,9 +246,14 @@ class SyntheticDataGenerator:
elif col_type == "string":
if "generator" in col_spec:
generator_str = col_spec["generator"]
if generator_str.startswith("faker."):
if generator_str == "faker":
sample_data[col_name] = [
self._generate_faker_value(generator_str)
self._generate_faker_value(col_spec.get("type", "string"))
for _ in range(fitting_size)
]
elif generator_str.startswith("faker."):
sample_data[col_name] = [
self._generate_faker_value(generator_str.split(".", 1)[1])
for _ in range(fitting_size)
]
elif generator_str.startswith("mimesis."):
@@ -295,16 +313,22 @@ class SyntheticDataGenerator:
# Generate correlated ID from parent table
parent_table = self._extract_parent_table(col_name)
value = self._generate_correlated_id(parent_table)
# Handle faker types and string types with faker generator
elif col_spec.get("generator") == "faker":
if col_type in {"first_name", "last_name", "email", "phone_number", "address", "text", "date_this_year", "date_this_decade", "date_of_birth"}:
value = self._generate_faker_value(col_type)
else:
value = self._generate_faker_value("text")
elif col_type == "string":
if "generator" in col_spec:
if col_spec["generator"].startswith("faker."):
value = self._generate_faker_value(col_spec["generator"])
value = self._generate_faker_value(col_spec["generator"].split(".", 1)[1])
elif col_spec["generator"].startswith("mimesis."):
value = self._generate_mimesis_value(col_spec["generator"])
elif "categories" in col_spec:
value = np.random.choice(col_spec["categories"])
else:
value = self._generate_faker_value("faker.word")
value = self._generate_faker_value("text")
elif col_type in ("int", "integer"):
min_val = col_spec.get("min", 0)
max_val = col_spec.get("max", 100)
@@ -315,7 +339,7 @@ class SyntheticDataGenerator:
if "generator" in col_spec:
value = self._generate_faker_value(col_spec["generator"])
else:
value = self._generate_faker_value("faker.date_time_this_decade")
value = self._generate_faker_value("date_time_this_decade")
elif col_type == "category":
value = np.random.choice(col_spec["categories"])

View File

@@ -146,3 +146,58 @@ async def test_integer_type_handling(data_generator):
assert all(isinstance(x, (int, np.integer)) for x in data["count"]), "Count values should be integers"
assert all(1 <= x <= 10 for x in data["count"]), "Count values should be within range"
assert not any(x is None for x in data["count"]), "Count values should not be null"
@pytest.mark.asyncio
async def test_faker_string_generation(data_generator):
    """Check Faker-backed string columns: generic, typed, and legacy specs."""
    row_count = 10

    # A plain "string" column paired with the bare "faker" generator.
    generic_schema = {"customer_name": {"type": "string", "generator": "faker"}}
    data = await data_generator.generate_synthetic_data("test", generic_schema, row_count)
    customer_names = data["customer_name"]
    assert all(isinstance(v, str) for v in customer_names), "Customer names should be non-null strings"
    assert not any(len(v) == 0 for v in customer_names), "Customer names should not be empty"

    # Columns whose type names a specific Faker type.
    typed_schema = {
        faker_type: {"type": faker_type, "generator": "faker"}
        for faker_type in ("first_name", "email", "address")
    }
    data = await data_generator.generate_synthetic_data("test", typed_schema, row_count)

    first_names = data["first_name"]
    assert all(isinstance(v, str) for v in first_names), "First names should be non-null strings"
    assert not any(len(v) == 0 for v in first_names), "First names should not be empty"

    emails = data["email"]
    assert all(isinstance(v, str) for v in emails), "Emails should be non-null strings"
    assert not any("@" not in v for v in emails), "Emails should contain @ symbol"

    addresses = data["address"]
    assert all(isinstance(v, str) for v in addresses), "Addresses should be non-null strings"
    assert not any(len(v) == 0 for v in addresses), "Addresses should not be empty"

    # The older "faker.<method>" generator spelling must keep working.
    legacy_schema = {"legacy_name": {"type": "string", "generator": "faker.name"}}
    data = await data_generator.generate_synthetic_data("test", legacy_schema, row_count)
    legacy_names = data["legacy_name"]
    assert all(isinstance(v, str) for v in legacy_names), "Legacy faker format should still work"
    assert not any(len(v) == 0 for v in legacy_names), "Legacy faker names should not be empty"