fix: Add faker type mapping and fix string generation

Co-Authored-By: alexander@anthropic.com <alexander@anthropic.com>
This commit is contained in:
Devin AI
2024-12-12 03:06:16 +00:00
parent 87354a84f2
commit e8ca4c0c3b
3 changed files with 102 additions and 15 deletions

View File

@@ -155,10 +155,15 @@ class DataGenServer:
This tool creates multiple tables of synthetic data based on provided schemas. It supports: This tool creates multiple tables of synthetic data based on provided schemas. It supports:
- Basic data types (integer, float, string, boolean) - Basic data types (integer, float, string, boolean)
- Categorical data with custom categories - Categorical data with custom categories
- Realistic personal data (names, emails, addresses) via Faker - Realistic personal data via Faker (names, emails, addresses)
- Numeric data with configurable ranges via NumPy - Numeric data with configurable ranges via NumPy
- Related tables with correlated IDs - Related tables with correlated IDs
When using the 'faker' generator, you must specify one of the supported faker types in the 'type' field:
- Personal: first_name, last_name, email, phone_number, address
- Dates: date_of_birth, date_this_year, date_this_decade
- Text: text (default for generic strings)
Default schemas are available for common scenarios (customers, policies, claims).""", Default schemas are available for common scenarios (customers, policies, claims).""",
inputSchema={ inputSchema={
"type": "object", "type": "object",
@@ -189,14 +194,17 @@ class DataGenServer:
"description": """Data type for the column. Valid options: "description": """Data type for the column. Valid options:
- Basic: 'string', 'integer'/'int', 'float', 'boolean' - Basic: 'string', 'integer'/'int', 'float', 'boolean'
- Categorical: 'category' - Categorical: 'category'
- Faker types: 'first_name', 'last_name', 'email', 'phone_number', - Faker types (use with 'faker' generator): 'first_name', 'last_name', 'email',
'address', 'date_of_birth', 'text', 'date_this_year', 'date_this_decade'""" 'phone_number', 'address', 'date_of_birth', 'text', 'date_this_year',
'date_this_decade'. Generic 'string' type defaults to 'text'."""
}, },
"generator": { "generator": {
"type": "string", "type": "string",
"description": """Library to use for generating values. Valid options: "description": """Library to use for generating values. Valid options:
- 'numpy': For numeric and categorical data - 'numpy': For numeric and categorical data
- 'faker': For realistic personal/business data - 'faker': For realistic personal/business data. Must be used with a supported
faker type (see type field). Will generate null values if used with
unsupported types.
- 'mimesis': Alternative to Faker for personal data""" - 'mimesis': Alternative to Faker for personal data"""
}, },
"min": { "min": {

View File

@@ -60,12 +60,25 @@ class SyntheticDataGenerator:
} }
return type_mapping.get(data_type, "categorical") return type_mapping.get(data_type, "categorical")
def _generate_faker_value(self, generator: str) -> Any: def _map_faker_type(self, data_type: str) -> str:
"""Generate value using Faker.""" """Map data type to Faker method."""
if not generator.startswith("faker."): type_mapping = {
return None "string": "text", # Default for generic strings
"first_name": "first_name",
"last_name": "last_name",
"email": "email",
"phone_number": "phone_number",
"address": "street_address",
"text": "text",
"date_this_year": "date_this_year",
"date_this_decade": "date_this_decade",
"date_of_birth": "date_of_birth"
}
return type_mapping.get(data_type, "text")
method_name = generator.split(".", 1)[1] def _generate_faker_value(self, data_type: str) -> Any:
"""Generate value using Faker."""
method_name = self._map_faker_type(data_type)
if hasattr(self.faker, method_name): if hasattr(self.faker, method_name):
return getattr(self.faker, method_name)() return getattr(self.faker, method_name)()
return None return None
@@ -211,7 +224,7 @@ class SyntheticDataGenerator:
generator_str = col_spec["generator"] generator_str = col_spec["generator"]
if generator_str.startswith("faker."): if generator_str.startswith("faker."):
sample_data[col_name] = [ sample_data[col_name] = [
self._generate_faker_value(generator_str) self._generate_faker_value(generator_str.split(".", 1)[1])
for _ in range(fitting_size) for _ in range(fitting_size)
] ]
elif generator_str.startswith("mimesis."): elif generator_str.startswith("mimesis."):
@@ -233,9 +246,14 @@ class SyntheticDataGenerator:
elif col_type == "string": elif col_type == "string":
if "generator" in col_spec: if "generator" in col_spec:
generator_str = col_spec["generator"] generator_str = col_spec["generator"]
if generator_str.startswith("faker."): if generator_str == "faker":
sample_data[col_name] = [ sample_data[col_name] = [
self._generate_faker_value(generator_str) self._generate_faker_value(col_spec.get("type", "string"))
for _ in range(fitting_size)
]
elif generator_str.startswith("faker."):
sample_data[col_name] = [
self._generate_faker_value(generator_str.split(".", 1)[1])
for _ in range(fitting_size) for _ in range(fitting_size)
] ]
elif generator_str.startswith("mimesis."): elif generator_str.startswith("mimesis."):
@@ -295,16 +313,22 @@ class SyntheticDataGenerator:
# Generate correlated ID from parent table # Generate correlated ID from parent table
parent_table = self._extract_parent_table(col_name) parent_table = self._extract_parent_table(col_name)
value = self._generate_correlated_id(parent_table) value = self._generate_correlated_id(parent_table)
# Handle faker types and string types with faker generator
elif col_spec.get("generator") == "faker":
if col_type in {"first_name", "last_name", "email", "phone_number", "address", "text", "date_this_year", "date_this_decade", "date_of_birth"}:
value = self._generate_faker_value(col_type)
else:
value = self._generate_faker_value("text")
elif col_type == "string": elif col_type == "string":
if "generator" in col_spec: if "generator" in col_spec:
if col_spec["generator"].startswith("faker."): if col_spec["generator"].startswith("faker."):
value = self._generate_faker_value(col_spec["generator"]) value = self._generate_faker_value(col_spec["generator"].split(".", 1)[1])
elif col_spec["generator"].startswith("mimesis."): elif col_spec["generator"].startswith("mimesis."):
value = self._generate_mimesis_value(col_spec["generator"]) value = self._generate_mimesis_value(col_spec["generator"])
elif "categories" in col_spec: elif "categories" in col_spec:
value = np.random.choice(col_spec["categories"]) value = np.random.choice(col_spec["categories"])
else: else:
value = self._generate_faker_value("faker.word") value = self._generate_faker_value("text")
elif col_type in ("int", "integer"): elif col_type in ("int", "integer"):
min_val = col_spec.get("min", 0) min_val = col_spec.get("min", 0)
max_val = col_spec.get("max", 100) max_val = col_spec.get("max", 100)
@@ -315,7 +339,7 @@ class SyntheticDataGenerator:
if "generator" in col_spec: if "generator" in col_spec:
value = self._generate_faker_value(col_spec["generator"]) value = self._generate_faker_value(col_spec["generator"])
else: else:
value = self._generate_faker_value("faker.date_time_this_decade") value = self._generate_faker_value("date_time_this_decade")
elif col_type == "category": elif col_type == "category":
value = np.random.choice(col_spec["categories"]) value = np.random.choice(col_spec["categories"])

View File

@@ -146,3 +146,58 @@ async def test_integer_type_handling(data_generator):
assert all(isinstance(x, (int, np.integer)) for x in data["count"]), "Count values should be integers" assert all(isinstance(x, (int, np.integer)) for x in data["count"]), "Count values should be integers"
assert all(1 <= x <= 10 for x in data["count"]), "Count values should be within range" assert all(1 <= x <= 10 for x in data["count"]), "Count values should be within range"
assert not any(x is None for x in data["count"]), "Count values should not be null" assert not any(x is None for x in data["count"]), "Count values should not be null"
@pytest.mark.asyncio
async def test_faker_string_generation(data_generator):
    """Test that faker properly generates string values.

    Covers three schema shapes: a generic ``string`` column using the bare
    ``faker`` generator, columns whose ``type`` names a specific faker type,
    and the legacy ``faker.<method>`` generator string.
    """
    # Test generic string type with faker generator
    schema = {
        "customer_name": {
            "type": "string",
            "generator": "faker",
        }
    }
    data = await data_generator.generate_synthetic_data("test", schema, 10)
    # isinstance() already rejects None, so no separate null check is needed.
    assert all(isinstance(x, str) for x in data["customer_name"]), "Customer names should be non-null strings"
    assert all(len(x) > 0 for x in data["customer_name"]), "Customer names should not be empty"

    # Test specific faker types
    schema = {
        "first_name": {
            "type": "first_name",
            "generator": "faker",
        },
        "email": {
            "type": "email",
            "generator": "faker",
        },
        "address": {
            "type": "address",
            "generator": "faker",
        },
    }
    data = await data_generator.generate_synthetic_data("test", schema, 10)

    # Verify first_name generation
    assert all(isinstance(x, str) for x in data["first_name"]), "First names should be non-null strings"
    assert all(len(x) > 0 for x in data["first_name"]), "First names should not be empty"

    # Verify email generation
    assert all(isinstance(x, str) for x in data["email"]), "Emails should be non-null strings"
    assert all("@" in x for x in data["email"]), "Emails should contain @ symbol"

    # Verify address generation
    assert all(isinstance(x, str) for x in data["address"]), "Addresses should be non-null strings"
    assert all(len(x) > 0 for x in data["address"]), "Addresses should not be empty"

    # Test legacy faker.method format still works
    schema = {
        "legacy_name": {
            "type": "string",
            "generator": "faker.name",
        }
    }
    data = await data_generator.generate_synthetic_data("test", schema, 10)
    assert all(isinstance(x, str) for x in data["legacy_name"]), "Legacy faker format should still work"
    assert all(len(x) > 0 for x in data["legacy_name"]), "Legacy faker names should not be empty"