diff --git a/src/datagen/src/mcp_server_datagen/server.py b/src/datagen/src/mcp_server_datagen/server.py index 1caff3a1..ef4a8aab 100644 --- a/src/datagen/src/mcp_server_datagen/server.py +++ b/src/datagen/src/mcp_server_datagen/server.py @@ -155,10 +155,15 @@ class DataGenServer: This tool creates multiple tables of synthetic data based on provided schemas. It supports: - Basic data types (integer, float, string, boolean) - Categorical data with custom categories - - Realistic personal data (names, emails, addresses) via Faker + - Realistic personal data via Faker (names, emails, addresses) - Numeric data with configurable ranges via NumPy - Related tables with correlated IDs + When using the 'faker' generator, you must specify one of the supported faker types in the 'type' field: + - Personal: first_name, last_name, email, phone_number, address + - Dates: date_of_birth, date_this_year, date_this_decade + - Text: text (default for generic strings) + Default schemas are available for common scenarios (customers, policies, claims).""", inputSchema={ "type": "object", @@ -189,14 +194,17 @@ class DataGenServer: "description": """Data type for the column. Valid options: - Basic: 'string', 'integer'/'int', 'float', 'boolean' - Categorical: 'category' - - Faker types: 'first_name', 'last_name', 'email', 'phone_number', - 'address', 'date_of_birth', 'text', 'date_this_year', 'date_this_decade'""" + - Faker types (use with 'faker' generator): 'first_name', 'last_name', 'email', + 'phone_number', 'address', 'date_of_birth', 'text', 'date_this_year', + 'date_this_decade'. Generic 'string' type defaults to 'text'.""" }, "generator": { "type": "string", "description": """Library to use for generating values. Valid options: - 'numpy': For numeric and categorical data - - 'faker': For realistic personal/business data + - 'faker': For realistic personal/business data. Must be used with a supported + faker type (see type field). Will generate null values if used with + unsupported types. - 'mimesis': Alternative to Faker for personal data""" }, "min": { diff --git a/src/datagen/src/mcp_server_datagen/synthetic.py b/src/datagen/src/mcp_server_datagen/synthetic.py index 6bc14261..3604c026 100644 --- a/src/datagen/src/mcp_server_datagen/synthetic.py +++ b/src/datagen/src/mcp_server_datagen/synthetic.py @@ -60,12 +60,25 @@ class SyntheticDataGenerator: } return type_mapping.get(data_type, "categorical") - def _generate_faker_value(self, generator: str) -> Any: - """Generate value using Faker.""" - if not generator.startswith("faker."): - return None + def _map_faker_type(self, data_type: str) -> str: + """Map data type to Faker method.""" + type_mapping = { + "string": "text", # Default for generic strings + "first_name": "first_name", + "last_name": "last_name", + "email": "email", + "phone_number": "phone_number", + "address": "street_address", + "text": "text", + "date_this_year": "date_this_year", + "date_this_decade": "date_this_decade", + "date_of_birth": "date_of_birth" + } + return type_mapping.get(data_type, "text") - method_name = generator.split(".", 1)[1] + def _generate_faker_value(self, data_type: str) -> Any: + """Generate value using Faker.""" + method_name = self._map_faker_type(data_type) if hasattr(self.faker, method_name): return getattr(self.faker, method_name)() return None @@ -211,7 +224,7 @@ class SyntheticDataGenerator: generator_str = col_spec["generator"] if generator_str.startswith("faker."): sample_data[col_name] = [ - self._generate_faker_value(generator_str) + self._generate_faker_value(generator_str.split(".", 1)[1]) for _ in range(fitting_size) ] elif generator_str.startswith("mimesis."): @@ -233,9 +246,14 @@ class SyntheticDataGenerator: elif col_type == "string": if "generator" in col_spec: generator_str = col_spec["generator"] - if generator_str.startswith("faker."): + if generator_str == "faker": sample_data[col_name] = [ - self._generate_faker_value(generator_str) + self._generate_faker_value(col_spec.get("type", "string")) + for _ in range(fitting_size) + ] + elif generator_str.startswith("faker."): + sample_data[col_name] = [ + self._generate_faker_value(generator_str.split(".", 1)[1]) for _ in range(fitting_size) ] elif generator_str.startswith("mimesis."): @@ -295,16 +313,22 @@ class SyntheticDataGenerator: # Generate correlated ID from parent table parent_table = self._extract_parent_table(col_name) value = self._generate_correlated_id(parent_table) + # Handle faker types and string types with faker generator + elif col_spec.get("generator") == "faker": + if col_type in {"first_name", "last_name", "email", "phone_number", "address", "text", "date_this_year", "date_this_decade", "date_of_birth"}: + value = self._generate_faker_value(col_type) + else: + value = self._generate_faker_value("text") elif col_type == "string": if "generator" in col_spec: if col_spec["generator"].startswith("faker."): - value = self._generate_faker_value(col_spec["generator"]) + value = self._generate_faker_value(col_spec["generator"].split(".", 1)[1]) elif col_spec["generator"].startswith("mimesis."): value = self._generate_mimesis_value(col_spec["generator"]) elif "categories" in col_spec: value = np.random.choice(col_spec["categories"]) else: - value = self._generate_faker_value("faker.word") + value = self._generate_faker_value("text") elif col_type in ("int", "integer"): min_val = col_spec.get("min", 0) max_val = col_spec.get("max", 100) @@ -315,7 +339,7 @@ class SyntheticDataGenerator: if "generator" in col_spec: value = self._generate_faker_value(col_spec["generator"]) else: - value = self._generate_faker_value("faker.date_time_this_decade") + value = self._generate_faker_value("date_time_this_decade") elif col_type == "category": value = np.random.choice(col_spec["categories"]) diff --git a/src/datagen/tests/unit/test_synthetic.py b/src/datagen/tests/unit/test_synthetic.py index 5835db3a..f892ec4e 100644 --- a/src/datagen/tests/unit/test_synthetic.py +++ b/src/datagen/tests/unit/test_synthetic.py @@ -146,3 +146,58 @@ async def test_integer_type_handling(data_generator): assert all(isinstance(x, (int, np.integer)) for x in data["count"]), "Count values should be integers" assert all(1 <= x <= 10 for x in data["count"]), "Count values should be within range" assert not any(x is None for x in data["count"]), "Count values should not be null" + + +@pytest.mark.asyncio +async def test_faker_string_generation(data_generator): + """Test that faker properly generates string values.""" + # Test generic string type with faker generator + schema = { + "customer_name": { + "type": "string", + "generator": "faker" + } + } + data = await data_generator.generate_synthetic_data("test", schema, 10) + assert all(isinstance(x, str) and x is not None for x in data["customer_name"]), "Customer names should be non-null strings" + assert all(len(x) > 0 for x in data["customer_name"]), "Customer names should not be empty" + + # Test specific faker types + schema = { + "first_name": { + "type": "first_name", + "generator": "faker" + }, + "email": { + "type": "email", + "generator": "faker" + }, + "address": { + "type": "address", + "generator": "faker" + } + } + data = await data_generator.generate_synthetic_data("test", schema, 10) + + # Verify first_name generation + assert all(isinstance(x, str) and x is not None for x in data["first_name"]), "First names should be non-null strings" + assert all(len(x) > 0 for x in data["first_name"]), "First names should not be empty" + + # Verify email generation + assert all(isinstance(x, str) and x is not None for x in data["email"]), "Emails should be non-null strings" + assert all("@" in x for x in data["email"]), "Emails should contain @ symbol" + + # Verify address generation + assert all(isinstance(x, str) and x is not None for x in data["address"]), "Addresses should be non-null strings" + assert all(len(x) > 0 for x in data["address"]), "Addresses should not be empty" + + # Test legacy faker.method format still works + schema = { + "legacy_name": { + "type": "string", + "generator": "faker.name" + } + } + data = await data_generator.generate_synthetic_data("test", schema, 10) + assert all(isinstance(x, str) and x is not None for x in data["legacy_name"]), "Legacy faker format should still work" + assert all(len(x) > 0 for x in data["legacy_name"]), "Legacy faker names should not be empty"