ScientificRegressionTask (Scientific Symbolic Regression)

evotoolkit.task.python_task.scientific_regression.ScientificRegressionTask

Bases: PythonTask

Scientific symbolic regression task for discovering mathematical equations.

This task evaluates Python code that defines an `equation` function, optimizes its free parameters with scipy (BFGS), and returns a fitness equal to the negated training MSE (higher is better).
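
A minimal usage sketch. The import path is assumed from the dotted module path above (the package may also re-export the class elsewhere), and the `evaluation_res` attribute name on `Solution` is assumed from the constructor call in the source below:

```python
from evotoolkit.task.python_task.scientific_regression import ScientificRegressionTask

# Build the task; the dataset is fetched automatically on first use.
task = ScientificRegressionTask(dataset_name="bactgrow")

# The generated prompt spells out the exact `equation` signature for this dataset.
print(task.get_base_task_description())
print(task.task_info)  # dataset_name, train_size, test_size, n_inputs, max_params

# Evaluate the built-in linear baseline end to end (parameters fit by scipy BFGS).
baseline = task.make_init_sol_wo_other_info()
print(baseline.evaluation_res.score)             # -train MSE, higher is better
print(baseline.evaluation_res.additional_info)   # train_mse, test_mse, warnings, ...
```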

Source code in src/evotoolkit/task/python_task/scientific_regression/scientific_regression_task.py
@register_task("ScientificRegression")
class ScientificRegressionTask(PythonTask):
    """
    Scientific symbolic regression task for discovering mathematical equations.

    This task evaluates Python code that defines an `equation` function,
    optimizes its parameters using scipy, and returns fitness based on MSE.
    """

    def __init__(
        self,
        dataset_name: Literal["bactgrow", "oscillator1", "oscillator2", "stressstrain"],
        data_dir: str | Path | None = None,
        max_params: int = 10,
        timeout_seconds: float = 60.0,
    ):
        """
        Initialize scientific regression task.

        Args:
            dataset_name: Name of the scientific dataset
            data_dir: Custom data directory (optional, defaults to ~/.evotool/data/)
            max_params: Maximum number of optimizable parameters
            timeout_seconds: Execution timeout
        """
        if dataset_name not in DATASET_INFO:
            raise ValueError(
                f"Unknown dataset: {dataset_name}. "
                f"Available: {list(DATASET_INFO.keys())}"
            )

        self.dataset_name = dataset_name
        self.max_params = max_params
        self.dataset_info = DATASET_INFO[dataset_name]

        # Load data
        train_data, test_data = self._load_dataset(dataset_name, data_dir)

        # Store data
        self.train_inputs = train_data["inputs"]
        self.train_outputs = train_data["outputs"]
        self.test_inputs = test_data["inputs"]
        self.test_outputs = test_data["outputs"]

        # Pass to parent
        super().__init__(
            data={"train": train_data, "test": test_data},
            timeout_seconds=timeout_seconds,
        )

    def _load_dataset(self, dataset_name: str, data_dir: str | Path | None):
        """
        Load dataset, automatically downloading from GitHub release if needed.

        Returns:
            tuple: (train_data, test_data) dictionaries with 'inputs' and 'outputs'
        """
        from evotoolkit.data import DownloadError, get_dataset_path

        try:
            # Get dataset path, will auto-download if needed
            base_dir = get_dataset_path(
                "scientific_regression", data_dir=data_dir)
            dataset_path = base_dir / dataset_name
        except DownloadError as e:
            raise FileNotFoundError(
                f"Failed to download dataset '{dataset_name}': {str(e)}"
            ) from e

        # Verify dataset exists
        if not dataset_path.exists():
            raise FileNotFoundError(
                f"Dataset '{dataset_name}' not found after download. "
                f"This might be a bug - please report it at: "
                f"https://github.com/pgg3/evotoolkit/issues"
            )

        # Load CSV files
        info = self.dataset_info
        train_df = pd.read_csv(dataset_path / "train.csv")
        # Use in-distribution test
        test_df = pd.read_csv(dataset_path / "test_id.csv")

        # Extract inputs and outputs
        train_data = {
            "inputs": train_df[info["input_cols"]].values,
            "outputs": train_df[info["output_col"]].values,
        }
        test_data = {
            "inputs": test_df[info["input_cols"]].values,
            "outputs": test_df[info["output_col"]].values,
        }

        return train_data, test_data

    def _process_data(self, data):
        """Process input data and create task_info."""
        self.data = data
        self.task_info = {
            "dataset_name": self.dataset_name,
            "train_size": len(data["train"]["inputs"]),
            "test_size": len(data["test"]["inputs"]),
            "n_inputs": data["train"]["inputs"].shape[1],
            "max_params": self.max_params,
        }

    def _evaluate_code_impl(self, candidate_code: str) -> EvaluationResult:
        """
        Evaluate Python code for scientific symbolic regression.

        The code must define an `equation` function that will be optimized.
        """
        # Create namespace with required modules
        namespace = {
            "__builtins__": {
                "len": len,
                "range": range,
                "enumerate": enumerate,
                "zip": zip,
                "map": map,
                "filter": filter,
                "sum": sum,
                "min": min,
                "max": max,
                "abs": abs,
                "print": print,
                "str": str,
                "int": int,
                "float": float,
                "list": list,
                "dict": dict,
                "tuple": tuple,
                "set": set,
                "__import__": __import__,
            },
            "np": np,
        }

        # Execute the code
        exec(candidate_code, namespace)

        # Check if equation function exists
        if "equation" not in namespace:
            return EvaluationResult(
                valid=False,
                score=float("-inf"),
                additional_info={
                    "error": 'Function "equation" not found in code'},
            )

        equation_func = namespace["equation"]

        # Evaluate on training data
        try:
            train_score, train_warnings = self._evaluate_equation(
                equation_func, self.train_inputs, self.train_outputs
            )

            # Evaluate on test data
            test_score, test_warnings = self._evaluate_equation(
                equation_func, self.test_inputs, self.test_outputs
            )

            # Combine all warnings
            all_warnings = list(set(train_warnings + test_warnings))

            if train_score is None or test_score is None:
                return EvaluationResult(
                    valid=False,
                    score=float("-inf"),
                    additional_info={
                        "error": "Optimization failed or returned NaN/Inf",
                        "warnings": all_warnings,
                    },
                )

            # Use train_score as fitness (already -MSE, higher is better)
            # Test score is only for final evaluation, not for optimization
            score = train_score

            return EvaluationResult(
                valid=True,
                score=score,
                additional_info={
                    "train_mse": -train_score,  # Convert -MSE back to MSE for logging
                    "test_mse": -test_score,  # Convert -MSE back to MSE for logging
                    "n_params": self.max_params,
                    "warnings": all_warnings if all_warnings else [],
                },
            )

        except Exception as e:
            return EvaluationResult(
                valid=False,
                score=float("-inf"),
                additional_info={"error": f"Evaluation error: {str(e)}"},
            )

    def _evaluate_equation(self, equation_func, inputs, outputs):
        """
        Evaluate equation with parameter optimization.

        Returns:
            tuple: (score, warnings_list) where score is -MSE (higher is better), or (None, warnings) if failed
        """
        from scipy.optimize import minimize

        captured_warnings = []

        # Define loss function
        def loss(params):
            try:
                # Call equation with unpacked inputs and params
                if inputs.shape[1] == 2:
                    y_pred = equation_func(inputs[:, 0], inputs[:, 1], params)
                elif inputs.shape[1] == 4:
                    y_pred = equation_func(
                        inputs[:, 0], inputs[:, 1], inputs[:,
                                                           2], inputs[:, 3], params
                    )
                else:
                    # Generic case
                    y_pred = equation_func(
                        *[inputs[:, i] for i in range(inputs.shape[1])], params
                    )

                mse = np.mean((y_pred - outputs) ** 2)
                return mse
            except Exception:
                return 1e10  # Large penalty for errors

        # Optimize parameters with warning capture
        try:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always")

                result = minimize(
                    loss,
                    x0=[1.0] * self.max_params,
                    method="BFGS",
                    options={"maxiter": 1000},
                )

                # Collect warning messages
                for warning in w:
                    msg = f"{warning.category.__name__}: {warning.message}"
                    if msg not in captured_warnings:  # Deduplicate
                        captured_warnings.append(msg)

            final_loss = result.fun

            # Check for NaN or Inf
            if np.isnan(final_loss) or np.isinf(final_loss):
                return (None, captured_warnings)

            # Return negative MSE (higher is better) and warnings
            return (-final_loss, captured_warnings)

        except Exception:
            return (None, captured_warnings)

    def get_base_task_description(self) -> str:
        """Get task description for the specific dataset."""
        info = self.dataset_info
        input_names = info["input_cols"]
        output_name = info["output_col"]

        # Build input signature
        if len(input_names) == 2:
            signature = f"{input_names[0]}: np.ndarray, {input_names[1]}: np.ndarray, params: np.ndarray"
        elif len(input_names) == 4:
            signature = (
                ", ".join([f"{name}: np.ndarray" for name in input_names])
                + ", params: np.ndarray"
            )
        else:
            signature = (
                ", ".join(
                    [f"input{i}: np.ndarray" for i in range(len(input_names))])
                + ", params: np.ndarray"
            )

        return f"""You are an expert in scientific symbolic regression and mathematical modeling.

Task: {info["description"]}

Your goal is to discover a mathematical equation that predicts {output_name} from:
{chr(10).join(f"  - {inp}" for inp in info["inputs"])}

Requirements:
- Define a function named 'equation' with signature: equation({signature}) -> np.ndarray
- Use numpy operations for vectorized computation
- The 'params' array contains {self.max_params} optimizable constants (params[0] to params[{self.max_params - 1}])
- Return predictions as a numpy array matching the shape of inputs
- Focus on discovering the mathematical structure; parameters will be auto-optimized

Guidelines:
- Use mathematical operations: +, -, *, /, **, np.exp, np.log, np.sin, np.cos, etc.
- Combine input variables in meaningful ways based on physical intuition
- Keep equations reasonably simple to avoid overfitting
- Ensure numerical stability (avoid division by very small numbers, etc.)
- All operations must be vectorized (work on numpy arrays)

Example structure:
```python
import numpy as np

def equation({signature}) -> np.ndarray:
    # Example: linear combination
    return params[0] * {input_names[0]} + params[1] * {input_names[1] if len(input_names) > 1 else input_names[0]}
```

Fitness: Your equation will be evaluated by optimizing parameters to minimize MSE on test data.
"""

    def make_init_sol_wo_other_info(self) -> Solution:
        """Create initial solution with simple linear equation."""
        info = self.dataset_info
        input_names = info["input_cols"]

        # Build simple linear combination
        if len(input_names) == 2:
            equation_body = f"    return params[0] * {input_names[0]} + params[1] * {input_names[1]} + params[2]"
            signature = f"{input_names[0]}, {input_names[1]}, params"
        elif len(input_names) == 4:
            terms = [f"params[{i}] * {name}" for i,
                     name in enumerate(input_names)]
            equation_body = (
                f"    return {' + '.join(terms)} + params[{len(input_names)}]"
            )
            signature = ", ".join(input_names) + ", params"
        else:
            terms = [
                f"params[{i}] * input{i}" for i in range(len(input_names))]
            equation_body = (
                f"    return {' + '.join(terms)} + params[{len(input_names)}]"
            )
            signature = (
                ", ".join([f"input{i}" for i in range(
                    len(input_names))]) + ", params"
            )

        initial_code = f'''import numpy as np

def equation({signature}):
    """Linear baseline model."""
{equation_body}
'''

        # Evaluate the initial solution
        eval_res = self.evaluate_code(initial_code)

        return Solution(sol_string=initial_code, evaluation_res=eval_res, other_info={})
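
For intuition, the parameter-fitting step inside `_evaluate_equation` can be reproduced standalone. The sketch below mirrors the BFGS call above on synthetic toy data (the two-input equation and the generated arrays are hypothetical, for illustration only):

```python
import numpy as np
from scipy.optimize import minimize

def equation(x, v, params):
    # Candidate structure; only the first three constants are actually used.
    return params[0] * x + params[1] * v + params[2]

# Toy data standing in for the dataset's train split.
rng = np.random.default_rng(0)
inputs = rng.normal(size=(200, 2))
outputs = 2.0 * inputs[:, 0] - 0.5 * inputs[:, 1] + 0.1

def loss(params):
    try:
        y_pred = equation(inputs[:, 0], inputs[:, 1], params)
        return np.mean((y_pred - outputs) ** 2)
    except Exception:
        return 1e10  # large penalty for runtime errors, as in the task

max_params = 10
result = minimize(loss, x0=[1.0] * max_params, method="BFGS", options={"maxiter": 1000})
print(-result.fun)  # fitness = -MSE, higher is better
```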

__init__

__init__(
    dataset_name: Literal[
        "bactgrow",
        "oscillator1",
        "oscillator2",
        "stressstrain",
    ],
    data_dir: str | Path | None = None,
    max_params: int = 10,
    timeout_seconds: float = 60.0,
)

Initialize scientific regression task.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dataset_name` | `Literal['bactgrow', 'oscillator1', 'oscillator2', 'stressstrain']` | Name of the scientific dataset | *required* |
| `data_dir` | `str \| Path \| None` | Custom data directory (optional, defaults to `~/.evotool/data/`) | `None` |
| `max_params` | `int` | Maximum number of optimizable parameters | `10` |
| `timeout_seconds` | `float` | Execution timeout | `60.0` |

Source code in src/evotoolkit/task/python_task/scientific_regression/scientific_regression_task.py
def __init__(
    self,
    dataset_name: Literal["bactgrow", "oscillator1", "oscillator2", "stressstrain"],
    data_dir: str | Path | None = None,
    max_params: int = 10,
    timeout_seconds: float = 60.0,
):
    """
    Initialize scientific regression task.

    Args:
        dataset_name: Name of the scientific dataset
        data_dir: Custom data directory (optional, defaults to ~/.evotool/data/)
        max_params: Maximum number of optimizable parameters
        timeout_seconds: Execution timeout
    """
    if dataset_name not in DATASET_INFO:
        raise ValueError(
            f"Unknown dataset: {dataset_name}. "
            f"Available: {list(DATASET_INFO.keys())}"
        )

    self.dataset_name = dataset_name
    self.max_params = max_params
    self.dataset_info = DATASET_INFO[dataset_name]

    # Load data
    train_data, test_data = self._load_dataset(dataset_name, data_dir)

    # Store data
    self.train_inputs = train_data["inputs"]
    self.train_outputs = train_data["outputs"]
    self.test_inputs = test_data["inputs"]
    self.test_outputs = test_data["outputs"]

    # Pass to parent
    super().__init__(
        data={"train": train_data, "test": test_data},
        timeout_seconds=timeout_seconds,
    )
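
A constructor sketch under the defaults documented above; the `data_dir` path and the invalid dataset name are illustrative:

```python
from pathlib import Path
from evotoolkit.task.python_task.scientific_regression import ScientificRegressionTask

task = ScientificRegressionTask(
    dataset_name="stressstrain",
    data_dir=Path("./data"),   # optional: keep datasets inside the project
    max_params=8,              # fewer optimizable constants than the default 10
    timeout_seconds=120.0,     # allow longer candidate evaluations
)

try:
    # Not in DATASET_INFO (static type checkers would also flag this literal).
    ScientificRegressionTask(dataset_name="pendulum")
except ValueError as e:
    print(e)  # lists the available dataset names
```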

get_base_task_description

get_base_task_description() -> str

Get task description for the specific dataset.

Source code in src/evotoolkit/task/python_task/scientific_regression/scientific_regression_task.py
    def get_base_task_description(self) -> str:
        """Get task description for the specific dataset."""
        info = self.dataset_info
        input_names = info["input_cols"]
        output_name = info["output_col"]

        # Build input signature
        if len(input_names) == 2:
            signature = f"{input_names[0]}: np.ndarray, {input_names[1]}: np.ndarray, params: np.ndarray"
        elif len(input_names) == 4:
            signature = (
                ", ".join([f"{name}: np.ndarray" for name in input_names])
                + ", params: np.ndarray"
            )
        else:
            signature = (
                ", ".join(
                    [f"input{i}: np.ndarray" for i in range(len(input_names))])
                + ", params: np.ndarray"
            )

        return f"""You are an expert in scientific symbolic regression and mathematical modeling.

Task: {info["description"]}

Your goal is to discover a mathematical equation that predicts {output_name} from:
{chr(10).join(f"  - {inp}" for inp in info["inputs"])}

Requirements:
- Define a function named 'equation' with signature: equation({signature}) -> np.ndarray
- Use numpy operations for vectorized computation
- The 'params' array contains {self.max_params} optimizable constants (params[0] to params[{self.max_params - 1}])
- Return predictions as a numpy array matching the shape of inputs
- Focus on discovering the mathematical structure; parameters will be auto-optimized

Guidelines:
- Use mathematical operations: +, -, *, /, **, np.exp, np.log, np.sin, np.cos, etc.
- Combine input variables in meaningful ways based on physical intuition
- Keep equations reasonably simple to avoid overfitting
- Ensure numerical stability (avoid division by very small numbers, etc.)
- All operations must be vectorized (work on numpy arrays)

Example structure:
```python
import numpy as np

def equation({signature}) -> np.ndarray:
    # Example: linear combination
    return params[0] * {input_names[0]} + params[1] * {input_names[1] if len(input_names) > 1 else input_names[0]}
```

Fitness: Your equation will be evaluated by optimizing parameters to minimize MSE on test data.
"""

make_init_sol_wo_other_info

make_init_sol_wo_other_info() -> Solution

Create initial solution with simple linear equation.

Source code in src/evotoolkit/task/python_task/scientific_regression/scientific_regression_task.py
    def make_init_sol_wo_other_info(self) -> Solution:
        """Create initial solution with simple linear equation."""
        info = self.dataset_info
        input_names = info["input_cols"]

        # Build simple linear combination
        if len(input_names) == 2:
            equation_body = f"    return params[0] * {input_names[0]} + params[1] * {input_names[1]} + params[2]"
            signature = f"{input_names[0]}, {input_names[1]}, params"
        elif len(input_names) == 4:
            terms = [f"params[{i}] * {name}" for i,
                     name in enumerate(input_names)]
            equation_body = (
                f"    return {' + '.join(terms)} + params[{len(input_names)}]"
            )
            signature = ", ".join(input_names) + ", params"
        else:
            terms = [
                f"params[{i}] * input{i}" for i in range(len(input_names))]
            equation_body = (
                f"    return {' + '.join(terms)} + params[{len(input_names)}]"
            )
            signature = (
                ", ".join([f"input{i}" for i in range(
                    len(input_names))]) + ", params"
            )

        initial_code = f'''import numpy as np

def equation({signature}):
    """Linear baseline model."""
{equation_body}
'''

        # Evaluate the initial solution
        eval_res = self.evaluate_code(initial_code)

        return Solution(sol_string=initial_code, evaluation_res=eval_res, other_info={})
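
For a dataset with two input columns (hypothetically named `x` and `v` here), the generated baseline code would look roughly like this:

```python
import numpy as np

def equation(x, v, params):
    """Linear baseline model."""
    return params[0] * x + params[1] * v + params[2]
```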