Initialization¶
import os
import sys
import copy
from typing import Any, Dict, List
from pydantic import BaseModel, Field
from typing import List, Optional
from IPython.display import display
from IPython.display import Markdown
from autogen import AfterWork, OnCondition, AfterWorkOption, SwarmAgent, SwarmResult, initiate_swarm_chat, UpdateSystemMessage, ConversableAgent, register_hand_off
from IPython.display import Image, display
default_llm_config = {'cache_seed': 42,
'temperature': 1e-05,
'top_p': 0.05,
'config_list': [{'model': 'gpt-4o',
'api_key': os.getenv('OPENAI_API_KEY'),
'api_type': 'openai'}],
'timeout': 1200}
from cmbagent import CMBAgent
path2output = "/Users/boris/CMBAgents/cmbagent/output/"
cmbagent = CMBAgent(
agent_list = [],
verbose=False,
agent_type='swarm',
agent_llm_configs = {
# 'engineer': {
# # Choose your model.
# "model": "gemini-2.0-pro-exp-02-05",
# # You need to provide your API key here.
# "api_key": os.getenv("GEMINI_API_KEY"),
# "api_type": "google",
# },
# 'engineer': {
# # Choose your model.
# "model": "claude-3-5-sonnet-20241022",
# # You need to provide your API key here.
# "api_key": os.getenv("ANTHROPIC_API_KEY"),
# "api_type": "anthropic",
# }
'engineer': {
# Choose your model.
"model": "o3-mini-2025-01-31",
# You need to provide your API key here.
"api_key": os.getenv("OPENAI_API_KEY"),
"api_type": "openai",
"reasoning_effort": "medium" ## low, medium, high
},
},
agent_instructions = {
"engineer":
"""
You write single self-consistent Python code blocks.
Your response is structured as follows:
**Code Explanation:**
<code_explanation>
**Python Code:**
<python_code>
**IMPORTANT**:
- Return one and only one Python code block in your response.
- Focus on one step at a time.
- Do not suggest incomplete code.
- Do not produce code blocks that are not intended for execution.
- Include only one code block per response.
- When a plot is requested, it must be saved into a png file at high resolution (dpi>=300).
- Review existing code before writing new code.
- Write Python to solve tasks within the code block.
- Avoid creating ".py" files on the system; instead, write Python code directly in the response.
- Do not check for installed packages.
- Do not install new packages.
- For animation/gif use imagemagick.
- Do not use '.show()' for plot, instead, always save the figures to disk in png format or similar with savefig method.
- Write very detailed docstrings for all methods/classes you define.
**Use Python language only.**
When generating code that produces a plot, you must: Save the plot to disk file using the savefig method or similar.
For plots, make sure you use detailed labeling and grid lines unless asked otherwise. Also, make sure you enable proper LaTeX rendering, i.e.,
`rcParams['text.usetex'] = True` and `rcParams['font.family'] = 'serif'`
```
Your implementation much achieve the best speed in terms of compute. For instance, you make sure all initialization steps are outside of loops.
""",
"executor":
"""
You execute python code provided to you by the engineer.
""",
}
)
CMBAgent version: 0.0.1post2 - admin - planner - engineer - executor
Context¶
shared_context = {
"feedback_left": 1,
"plot_paths" : None,
}
Functions¶
lecturer agent¶
lecturer_response_formatter agent¶
course_director agent¶
course_director_response_formatter agent¶
feedback_logger¶
def log_feedback(context_variables: dict) -> SwarmResult:
"""Log feedback."""
context_variables["feedback_left"] -= 1
return SwarmResult(agent=lecturer,
values=f"Feedback logged. Number of feedbaclk rounds left: {context_variables["feedback_left"]}. Implement recommendations...",
context_variables=context_variables)
course_material_provider agent¶
def show_plots(plot_paths: List[str], context_variables: dict) -> SwarmResult:
"""
Display the generated plots image using the file path recorded in the context.
"""
# Use the recorded plot path or fall back to the default image
try:
plot_paths = context_variables.get("plot_paths")
print(f"using {plot_paths} from context.")
except:
print(f"using {plot_paths} from messages.")
for path in plot_paths:
display(Image(filename=path2output+path))
return SwarmResult(
agent=course_material_provider, # No agent handoff; this function simply displays the image.
values=f"Displayed plot from {plot_paths}",
context_variables=context_variables
)
def record_plot_paths(plot_paths: List[str], context_variables: dict) -> SwarmResult:
"""
Record the file paths of the generated plots into the shared context.
This function is intended to be called once a plot has been generated and saved.
Parameters:
plot_path (str): The absolute or relative file path to the saved plot.
context_variables (dict): The shared context dictionary.
Returns:
SwarmResult: A result object that logs the plot path in the context.
"""
# Update the context with the path to the plot file.
context_variables["plot_paths"] = plot_paths
# You can add additional logging or processing here if needed.
return SwarmResult(
agent=course_material_provider,
values=f"Plot paths recorded: {plot_paths}",
context_variables=context_variables
)
engineer¶
engineer_response_formatter agent¶
Agents¶
lecturer agent¶
llm_config¶
lecturer_config_list = copy.deepcopy(default_llm_config)
lecturer_config_list['config_list'][0]['model'] = 'o3-mini-2025-01-31'
# Remove 'temperature' and 'top_p' keys safely for o3,
# however buggy for transitions....
lecturer_config_list.pop('temperature', None)
lecturer_config_list.pop('top_p', None)
lecturer_config_list['config_list'][0]["reasoning_effort"] = "medium" ## low, medium, high
response format¶
message¶
lecturer_message = """
You are a Lecturer in the MPhil in Data Intensive Science at the University of Cambridge.
Your broad topics include intruction to machine learning, advanced machine learning, advanced statistics, research computing and software development.
You prepare lectures that are 50 minutes long.
You have an engineer assistant that helps you build the material (lecture notes, plots and codes) for the lectures.
Your goal is to provide a detailed lecture plan for the given topic. Each lecture must be designed to last 50 minutes.
Your response has the following structure:
**Lecture Title:**
<title>
**Plan:**
- Section 1:
- <title>
- <summary>
- <Textual suggestions or descriptions for code examples>
- <Textual suggestions or descriptions for plots or visualizations>
- Section 2:
- <title>
- <summary>
- <Textual suggestions or descriptions for code examples>
- <Textual suggestions or descriptions for plots or visualizations>
and so on.
Note that you don't need to provide the actual codes or plots, simply a decription of these.
"""
agent¶
lecturer = ConversableAgent(
name="lecturer",
system_message=lecturer_message,
llm_config=lecturer_config_list,
)
lecturer_response_formatter agent¶
llm_config¶
lecturer_response_config_list = copy.deepcopy(default_llm_config)
response format¶
class LectureSection(BaseModel):
title: str = Field(..., description="Title of the lecture section")
summary: str = Field(..., description="A brief summary of the section content")
code_suggestions: str = Field(..., description="Textual suggestions or descriptions for code examples")
plot_suggestions: str = Field(..., description="Textual suggestions or descriptions for plots or visualizations")
class LecturePlan(BaseModel):
lecture_title: str = Field(..., description="Title of the lecture")
sections: List[LectureSection] = Field(..., description="List of lecture sections")
def format(self) -> str:
sections_output = "\n".join(
f"- Section {i+1}:\n"
f" - Title: {section.title}\n"
f" - Summary: {section.summary}\n"
f" - Code Suggestions: {section.code_suggestions}\n"
f" - Plot Suggestions: {section.plot_suggestions}\n"
for i, section in enumerate(self.sections)
)
message = f"""
**Lecture Title:**
{self.lecture_title}
**Plan:**
{sections_output}
"""
return message
lecturer_response_config_list['config_list'][0]['response_format'] = LecturePlan
message¶
lecturer_response_formatter_message = """
You are a formatting agent, you format the response provided by the lecturer agent
"""
agent¶
lecturer_response_formatter = ConversableAgent(
name="lecturer_response_formatter",
system_message=lecturer_response_formatter_message,
llm_config=lecturer_response_config_list,
)
course_director agent¶
llm_config¶
course_director_config_list = copy.deepcopy(default_llm_config)
course_director_config_list['config_list'][0]['model'] = 'claude-3-5-sonnet-20241022'
course_director_config_list['config_list'][0]["api_key"] = os.getenv("ANTHROPIC_API_KEY")
course_director_config_list['config_list'][0]["api_type"] = "anthropic"
response format¶
message¶
course_director_message = """
You make make recommendations to improve the the lecture plan suggested by the lecturer.
You must ensure that the level of knowledge is of very high-standard but still approachable for pre-doctoral students.
The students on the course generally continue in research in both industry and academia as machine learning researchers or engineers, data scientists, and software developers.
Your response has the following structure:
**Recommendations:**
- <recommendation>
- <recommendation>
- <recommendation>
...
Before making recommendations, check if you can still provide feedback.
The current status of this workflow is:
feedback_left: {feedback_left}
When there is no feedback left (i.e., feedback_left = 0), transfer to course_material_provider.
"""
agent¶
course_director = ConversableAgent(
name="course_director",
update_agent_state_before_reply=[
UpdateSystemMessage(course_director_message),
],
llm_config=course_director_config_list,
)
course_director_response_formatter agent¶
llm_config¶
course_director_response_config_list = copy.deepcopy(default_llm_config)
response format¶
class DirectorResponse(BaseModel):
recommendations: List[str] = Field(..., description="recommendation.")
def format(self) -> str:
recommendations_output = "\n".join(f"- {recommendation}\n" for i, recommendation in enumerate(self.recommendations))
return f"""
**Recommendations:**
{recommendations_output}
"""
course_director_response_config_list['config_list'][0]['response_format'] = DirectorResponse
message¶
course_director_response_formatter_message = """
You are a formatting agent, you format the response provided by the course_director agent
"""
agent¶
course_director_response_formatter = ConversableAgent(
name="course_director_response_formatter",
system_message=course_director_response_formatter_message,
llm_config=course_director_response_config_list,
)
feedback_logger agent¶
llm_config¶
feedback_logger_response_config_list = copy.deepcopy(default_llm_config)
response format¶
message¶
feedback_logger_message = """
You call log_feedback to log the beedback from the course director
"""
agent¶
feedback_logger = ConversableAgent(
name="feedback_logger",
system_message=feedback_logger_message,
llm_config=feedback_logger_response_config_list,
functions=[log_feedback],
)
course_material_provider agent¶
llm_config¶
course_material_provider_config_list = copy.deepcopy(default_llm_config)
response format¶
message¶
course_material_provider_message = """
You coordinate the worflow to provide all the course material needed for the lecture.
This includes the lecture notes, the codes and the plots.
Codes should be in Python and placed inside the lecture notes written in markdown.
You may proceed section by section.
"""
agent¶
course_material_provider = ConversableAgent(
name="course_material_provider",
system_message=course_material_provider_message,
llm_config=course_material_provider_config_list,
functions=[show_plots, record_plot_paths],
)
engineer¶
engineer_response_formatter agent¶
llm_config¶
engineer_response_config_list = copy.deepcopy(cmbagent.llm_config)
response format¶
# engineer response
class EngineerResponse(BaseModel):
# steps: list[Step]
# step: Step
code_explanation: str = Field(..., description="The code explanation")
python_code: str = Field(..., description="The Python code in a form ready to execute")
def format(self) -> str:
return f"""
**Code Explanation:**
{self.code_explanation}
**Python Code:**
```python
{self.python_code}
```
"""
engineer_response_config_list['config_list'][0]['response_format'] = EngineerResponse
message¶
engineer_response_formatter_message = "You are a formatting agent, you format the response provided by the engineer agent."
agent¶
engineer_response_formatter = ConversableAgent(
name="engineer_response_formatter",
system_message=engineer_response_formatter_message,
llm_config=engineer_response_config_list,
)
Hand_off¶
lecturer agent¶
register_hand_off(
agent = lecturer,
hand_to = [
AfterWork(lecturer_response_formatter)
]
)
lecturer_response_formatter agent¶
register_hand_off(
agent = lecturer_response_formatter,
hand_to = [
AfterWork(course_director)
]
)
course_director agent¶
def no_feedback_left(agent: ConversableAgent, messages: List[Dict[str, Any]]) -> bool:
feedback_left = agent.get_context("feedback_left")
return feedback_left <= 0
register_hand_off(
agent = course_director,
hand_to = [
OnCondition(
target=course_material_provider,
condition="No feedback left.",
available=no_feedback_left,
),
AfterWork(course_director_response_formatter)
]
)
course_director_formatter agent¶
register_hand_off(
agent = course_director_response_formatter,
hand_to = [
AfterWork(feedback_logger)
]
)
feedback_logger agent¶
register_hand_off(
agent = feedback_logger,
hand_to = [
AfterWork(lecturer)
]
)
course_material_provider¶
register_hand_off(
agent = course_material_provider,
hand_to = [
OnCondition(
target=cmbagent.engineer.agent,
condition="Plot or visualisation needed.",
),
OnCondition(
target=cmbagent.engineer.agent,
condition="Problem with code execution.",
),
AfterWork(AfterWorkOption.REVERT_TO_USER)
]
)
engineer¶
register_hand_off(
agent = cmbagent.engineer.agent,
hand_to = [
AfterWork(engineer_response_formatter),
]
)
engineer_response_formatter agent¶
register_hand_off(
agent = engineer_response_formatter,
hand_to =
[AfterWork(cmbagent.executor.agent)])
executor¶
register_hand_off(
agent = cmbagent.executor.agent,
hand_to = [
AfterWork(course_material_provider),
]
)
Run chat¶
task = """
Prepare a lecture on the topic of the Universal Approximation Theorem.
"""
initial_agent = lecturer
agents=[lecturer,
lecturer_response_formatter,
course_director,
course_director_response_formatter,
feedback_logger,
course_material_provider,
cmbagent.engineer.agent,
cmbagent.executor.agent,
engineer_response_formatter,
cmbagent.admin.agent
]
for agent in agents:
# print("resetting:",agent.name)
agent.reset()
print("all agents reset")
all agents reset
cmbagent.clear_cache()
chat_result, context_variables, last_agent = initiate_swarm_chat(
initial_agent=initial_agent,
agents=agents,
messages=task,
max_rounds = 50,
user_agent=cmbagent.admin.agent,
context_variables=shared_context,
)
Message from admin:
Prepare a lecture on the topic of the Universal Approximation Theorem.
-------------------------------------------------------------------------------- Calling: lecturer... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
o3-mini-2025-01-31 | lecturer | 0.00563 | 805 | 1078 | 1883 |
Message from lecturer: Forwarding content for formatting... -------------------------------------------------------------------------------- Calling: lecturer_response_formatter... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | lecturer_response_formatter | 0.01117 | 1658 | 702 | 2360 |
Message from lecturer_response_formatter:
Lecture Title:
The Universal Approximation Theorem: Theory, Implications, and Applications in Neural Networks
Plan:
Section 1:
- Title: Introduction and Motivation
- Summary: Introduce the concept of universal approximation by neural networks. Begin with historical context and motivation behind why approximating arbitrary functions is crucial in the field of machine learning. Explain real-world problems that benefit from such approximators and set the stage for more mathematical and practical discussions in the subsequent sections.
- Code Suggestions: Provide a simple Python example that demonstrates function approximation using a basic neural network (e.g., approximating a sine function). The code should illustrate dataset creation, network design, training, and plotting the true function vs. the network's prediction.
- Plot Suggestions: Include a plot showing the original function (such as sine) and the neural network's approximation overlaid to visually demonstrate the approximation quality. Use clear labels and legends.
Section 2:
- Title: Mathematical Formulation and Theoretical Foundations
- Summary: Present the formal statement of the Universal Approximation Theorem. Discuss the necessary conditions, including activation functions and network architecture. Outline a proof sketch or intuitive explanation to help demystify the theorem's assertions. Emphasize the significance and boundaries of the theorem in theory.
- Code Suggestions: While this section is theory-heavy, include pseudo-code or comments that simulate the idea of sequential function composition in neural networks. This could help bridge the gap between formal writing and computational implementation.
- Plot Suggestions: Create a diagram that visually represents a feedforward neural network, annotating the roles of each layer in function composition. Additionally, a schematic diagram showing how various functions can combine to approximate a target function would be beneficial.
Section 3:
- Title: Neural Networks as Universal Approximators
- Summary: Connect the theorem directly to neural network architectures. Discuss how modern neural networks (especially shallow networks with non-linear activations) satisfy the conditions of the theorem. Include discussion on the impact of network width, depth, and activation choices. Introduce limitations and practical considerations in training such networks to realize the theoretical potential.
- Code Suggestions: Present a Python code snippet (using frameworks such as TensorFlow or PyTorch) that sets up a shallow neural network. Demonstrate training it on a known function and evaluate how changes in network width affect approximation quality. Include comments explaining each part of the code.
- Plot Suggestions: Include a series of plots that compare the performance of neural networks with different widths. For instance, subplots could depict the target function alongside approximations from networks of varying sizes, highlighting the improvement or saturation in performance.
Section 4:
- Title: Practical Implications, Limitations, and Future Directions
- Summary: Reflect on the practical applications of the Universal Approximation Theorem in real-world neural network design and its limitations. Discuss scenarios where the theorem holds and where it might fall short (e.g., issues in training such approximators, overfitting, and computational constraints). Conclude with insights about current research trends that address these challenges and possible future directions.
- Code Suggestions: Provide a code snippet that demonstrates overfitting by training a neural network on highly noisy data and showing that even though the network has the capacity to approximate complex functions, practical issues like overfitting still occur. Annotate key sections within the code to emphasize learning points.
- Plot Suggestions: Display a visualization with plots that present training versus validation errors, demonstrating concepts like overfitting. Include charts showing how error metrics vary with changes in model capacity and regularization parameters.
-------------------------------------------------------------------------------- Calling: course_director... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
claude-3-5-sonnet-20241022 | course_director | 0.01107 | 2604 | 217 | 2821 |
Message from course_director: Forwarding content for formatting... -------------------------------------------------------------------------------- Calling: course_director_response_formatter... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | course_director_response_formatter | 0.00795 | 2474 | 176 | 2650 |
Message from course_director_response_formatter:
Recommendations:
In Section 1, consider adding a brief discussion about different types of universal approximation theorems (e.g., for bounded continuous functions vs L^p spaces) to provide a more comprehensive introduction.
For Section 2, suggest including specific examples of activation functions that satisfy the theorem's conditions (ReLU, sigmoid, tanh) and those that don't, with mathematical explanations of why.
In Section 3, recommend expanding the discussion to include the trade-off between network width and depth, referencing recent research on the expressivity of deep vs. shallow networks.
For Section 4, suggest adding a subsection on computational complexity and the relationship between the theorem's guarantees and the practical challenges of optimization/training.
Consider adding brief mentions of extensions to the theorem for other architectures (CNNs, RNNs) to make connections with modern deep learning.
-------------------------------------------------------------------------------- Calling: feedback_logger... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | feedback_logger | 0.00670 | 2635 | 11 | 2646 |
Message from feedback_logger: ***** Suggested tool call (call_wR2ec2E4wbT2dXepKyObunUT): log_feedback ***** Arguments: {} ***************************************************************************** -------------------------------------------------------------------------------- Calling: _Swarm_Tool_Executor... >>>>>>>> EXECUTING FUNCTION log_feedback... Call ID: call_wR2ec2E4wbT2dXepKyObunUT Input arguments: {'context_variables': {'feedback_left': 1, 'plot_paths': None}} Message from _Swarm_Tool_Executor: ***** Response from calling tool (call_wR2ec2E4wbT2dXepKyObunUT) ***** Feedback logged. Number of feedbaclk rounds left: 0. Implement recommendations... ********************************************************************** -------------------------------------------------------------------------------- Calling: lecturer... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
o3-mini-2025-01-31 | lecturer | 0.01410 | 3677 | 2286 | 5963 |
Message from lecturer: Forwarding content for formatting... -------------------------------------------------------------------------------- Calling: lecturer_response_formatter... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | lecturer_response_formatter | 0.03051 | 5588 | 1654 | 7242 |
Message from lecturer_response_formatter:
Lecture Title:
The Universal Approximation Theorem: Theory, Implications, and Applications in Neural Networks
Plan:
Section 1:
- Title: Introduction, Motivation, and Variants of Universal Approximation
- Summary: Introduce the concept of universal approximation by neural networks, providing historical context and motivation. Discuss why approximating arbitrary functions is fundamental in machine learning. Expand the introduction by discussing various forms of universal approximation theorems, such as those for bounded continuous functions versus Lᵖ spaces, to offer a more comprehensive perspective. Highlight real-world problems that benefit from function approximation and how these theoretical results motivate practical network designs.
- Code Suggestions: Create a Python example where a simple neural network approximates a sine function. Illustrate dataset generation, network definition, training, and final prediction. Add comments indicating where one might incorporate variants of approximation depending on function space assumptions (e.g., bounded versus Lᵖ spaces).
- Plot Suggestions: Include a plot that overlays the true sine function with the neural network’s prediction. Annotate plot elements with labels for the true function, predicted function, and any error metrics. Consider an inset or additional subplot briefly highlighting different regimes (bounded continuous vs. Lᵖ) if possible.
Section 2:
- Title: Mathematical Formulation, Activation Functions, and Theoretical Foundations
- Summary: Present the formal statement of the Universal Approximation Theorem and discuss the necessary conditions. Delve into the mathematical details of the activation functions, providing examples of those that satisfy the theorem (such as ReLU, sigmoid, and tanh) and contrasting these with functions that do not meet the conditions. Include brief mathematical explanations of why specific activations (for instance, functions lacking non-linearity or continuity) may fail the criteria.
- Code Suggestions: Introduce pseudo-code or annotated code snippets simulating sequential function composition in neural networks. Include comments that mention the role of activation choices, with side notes on how changing these in code could affect the approximation properties.
- Plot Suggestions: Provide diagrams that first visualize a standard feedforward neural network, with annotations highlighting how each layer and its activation contribute to function composition. Additionally, consider a comparative schematic showing idealized graphs of different activation functions, emphasizing their mathematical properties relevant to approximation (e.g., sigmoid’s smoothness vs. ReLU’s discontinuous derivative).
Section 3:
- Title: Neural Networks as Universal Approximators and the Width-Depth Trade-off
- Summary: Connect the theorem directly to contemporary neural network architectures. Discuss the role of shallow networks versus deep networks and elaborate on the trade-off between network width and depth. Reference recent research that investigates the expressivity of deep versus shallow networks, discussing how increased depth might sometimes offer improved approximation with fewer neurons or alternatively, how sufficient width can compensate under certain conditions. Also mention extensions of universal approximation guarantees to other architectures such as CNNs and RNNs.
- Code Suggestions: Present a Python example (using TensorFlow or PyTorch) that sets up shallow and deep neural networks with varying widths. The code should illustrate training on a known function, comparing performance as network structure changes. Include detailed inline comments discussing the impact of depth versus width on approximation capability.
- Plot Suggestions: Include a series of subplots that compare performance metrics and graphical approximations of the target function across network architectures. One subplot could compare shallow versus deep networks, another could vary network width. Optionally, add visuals to indicate extensions to CNNs/RNNs, such as convolutional filters or recurrent loops, to emphasize architecture-specific considerations.
Section 4:
- Title: Practical Implications, Computational Complexity, and Future Directions
- Summary: Reflect on the practical applications of the Universal Approximation Theorem in neural network design and highlight its limitations. Discuss the impact of overfitting and machine training challenges despite theoretical guarantees. Introduce a dedicated subsection on computational complexity—examining the relationship between approximation capacity and optimization/training challenges (e.g., local minima, vanishing gradients). Conclude with considerations regarding recent attempts to address these problems and the future direction of research in making universal approximators both theoretically sound and computationally efficient.
- Code Suggestions: Provide a Python code snippet demonstrating overfitting by training a neural network on noisy data. Include annotations to point out regions where the network's capacity leads to overfitting, and highlight the trade-off with increased computational cost when scaling network size. Also, compare training/validation metrics to illustrate the optimization challenges.
- Plot Suggestions: Display plots that illustrate training versus validation errors over epochs. Include charts that demonstrate how increased model complexity (in terms of computational burden) affects convergence and error metrics. Consider additional plots that show estimated computational costs or training time as a function of network size, linking back to the discussion on optimization challenges.
-------------------------------------------------------------------------------- Calling: course_director... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
claude-3-5-sonnet-20241022 | course_director | 0.01858 | 5794 | 80 | 5874 |
Message from course_director: Since all feedback has been incorporated and feedback_left = 0, I will now transfer to the course material provider to begin creating the detailed materials for this lecture. ***** Suggested tool call (toolu_01MqUUaouk2sjVNNvh52MWrQ): transfer_course_director_to_course_material_provider ***** Arguments: {} ********************************************************************************************************************** -------------------------------------------------------------------------------- Calling: _Swarm_Tool_Executor... >>>>>>>> EXECUTING FUNCTION transfer_course_director_to_course_material_provider... Call ID: toolu_01MqUUaouk2sjVNNvh52MWrQ Input arguments: {} Message from _Swarm_Tool_Executor: ***** Response from calling tool (toolu_01MqUUaouk2sjVNNvh52MWrQ) ***** Swarm agent --> course_material_provider *********************************************************************** -------------------------------------------------------------------------------- Calling: course_material_provider... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | course_material_provider | 0.01275 | 5036 | 16 | 5052 |
Message from course_material_provider: ***** Suggested tool call (call_Rcx1smaerriKStd4nj3mTaFj): transfer_course_material_provider_to_engineer ***** Arguments: {} ************************************************************************************************************** -------------------------------------------------------------------------------- Calling: _Swarm_Tool_Executor... >>>>>>>> EXECUTING FUNCTION transfer_course_material_provider_to_engineer... Call ID: call_Rcx1smaerriKStd4nj3mTaFj Input arguments: {} Message from _Swarm_Tool_Executor: ***** Response from calling tool (call_Rcx1smaerriKStd4nj3mTaFj) ***** Swarm agent --> engineer ********************************************************************** -------------------------------------------------------------------------------- Calling: engineer... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
o3-mini-2025-01-31 | engineer | 0.02298 | 5203 | 3922 | 9125 |
Message from engineer: Forwarding content for formatting... -------------------------------------------------------------------------------- Calling: engineer_response_formatter... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | engineer_response_formatter | 0.05544 | 8095 | 3520 | 11615 |
Message from engineer_response_formatter:
Code Explanation:
The provided Python script demonstrates three key concepts related to the Universal Approximation Theorem using neural networks:
Sine Function Approximation: A simple neural network with one hidden layer is used to approximate the sine function. This section illustrates the basic idea of function approximation using neural networks, highlighting the concept of universal approximation in different function spaces.
Shallow vs. Deep Networks: This part compares the performance of a shallow network (one hidden layer) and a deep network (multiple hidden layers) in approximating the sine function. It demonstrates the trade-offs between network width and depth, showing how different architectures can affect approximation quality.
Overfitting Demonstration: A neural network is trained on a small, noisy dataset to demonstrate overfitting. The training and validation losses are plotted over epochs to show how excessive model capacity can lead to overfitting, despite the theoretical guarantees of universal approximation. This section also touches on computational complexity and optimization challenges in training neural networks.
Python Code:
#!/usr/bin/env python3
"""
This script demonstrates various aspects of the Universal Approximation Theorem through
practical Python examples. It includes:
1. A simple sine function approximation using a basic neural network.
2. A comparison between shallow and deep network architectures.
3. An example showcasing overfitting using a noisy dataset.
Each demonstration includes detailed docstrings and inline comments to guide understanding.
All plots are saved to disk with high resolution (dpi>=300) and with LaTeX rendering enabled.
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Enable LaTeX rendering for matplotlib plots
rcParams['text.usetex'] = True
rcParams['font.family'] = 'serif'
# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
# =============================================================================
# Helper Classes and Functions
# =============================================================================
class SimpleNeuralNet(nn.Module):
"""
Simple feedforward neural network with configurable hidden layers.
Attributes:
layers (nn.Sequential): The feedforward layers of the network.
"""
def __init__(self, input_dim, hidden_dims, output_dim, activation=nn.Tanh):
"""
Initialize the neural network.
Args:
input_dim (int): Dimension of the input features.
hidden_dims (list of int): List specifying the number of neurons in each hidden layer.
output_dim (int): Dimension of the output.
activation (class): Activation function class to be used (e.g., nn.Tanh, nn.ReLU).
"""
super(SimpleNeuralNet, self).__init__()
layer_list = []
prev_dim = input_dim
# Create hidden layers
for hdim in hidden_dims:
layer_list.append(nn.Linear(prev_dim, hdim))
layer_list.append(activation())
prev_dim = hdim
# Output layer
layer_list.append(nn.Linear(prev_dim, output_dim))
self.layers = nn.Sequential(*layer_list)
def forward(self, x):
"""
Forward pass through the network.
Args:
x (torch.Tensor): Input tensor.
Returns:
torch.Tensor: Network output.
"""
return self.layers(x)
def train_model(model, dataloader, criterion, optimizer, n_epochs=500, verbose=False, val_loader=None):
"""
Train a PyTorch model.
Args:
model (nn.Module): Neural network model to train.
dataloader (DataLoader): DataLoader for training data.
criterion (function): Loss function.
optimizer (torch.optim.Optimizer): Optimizer.
n_epochs (int): Number of epochs for training.
verbose (bool): If True, prints training progress.
val_loader (DataLoader, optional): DataLoader for validation data.
Returns:
tuple: Lists of training losses and (if provided) validation losses.
"""
train_losses = []
val_losses = []
for epoch in range(n_epochs):
model.train()
epoch_losses = []
for X_batch, y_batch in dataloader:
optimizer.zero_grad()
outputs = model(X_batch)
loss = criterion(outputs, y_batch)
loss.backward()
optimizer.step()
epoch_losses.append(loss.item())
train_loss = np.mean(epoch_losses)
train_losses.append(train_loss)
if val_loader is not None:
model.eval()
val_epoch_losses = []
with torch.no_grad():
for X_val, y_val in val_loader:
val_outputs = model(X_val)
val_loss = criterion(val_outputs, y_val)
val_epoch_losses.append(val_loss.item())
val_loss_mean = np.mean(val_epoch_losses)
val_losses.append(val_loss_mean)
if verbose and (epoch+1) % 100 == 0:
if val_loader is not None:
print(f'Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss_mean:.4f}')
else:
print(f'Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}')
if val_loader is not None:
return train_losses, val_losses
return train_losses, None
# =============================================================================
# Section 1: Sine Function Approximation with a Simple Neural Network
# =============================================================================
def demo_sine_approximation():
"""
Demonstrates universal approximation by approximating the sine function with a neural network.
This example creates a dataset from the sine function, defines a simple network, trains it,
and plots the true sine function vs. the network's prediction.
Annotations in the code point out where variants of universal approximation (bounded vs. Lᵖ)
might be discussed.
"""
# Generate dataset: 1000 points in [0, 2*pi]
N = 1000
X = np.linspace(0, 2*np.pi, N).reshape(-1, 1)
y = np.sin(X)
# Convert to torch tensors
X_tensor = torch.from_numpy(X).float()
y_tensor = torch.from_numpy(y).float()
# Create DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Define a simple network with one hidden layer
model = SimpleNeuralNet(input_dim=1, hidden_dims=[50], output_dim=1, activation=nn.Tanh)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Train the model (500 epochs)
train_model(model, dataloader, criterion, optimizer, n_epochs=500, verbose=True)
# Make predictions using the trained model
model.eval()
with torch.no_grad():
y_pred = model(X_tensor).detach().numpy()
# Plot the true sine function vs. the network's approximation
plt.figure(figsize=(8, 5))
plt.plot(X, y, label=r"True Function: $\sin(x)$", color='blue')
plt.plot(X, y_pred, label=r"NN Approximation", color='red', linestyle='--')
plt.xlabel(r'$x$')
plt.ylabel(r'$f(x)$')
plt.title(r'Sine Function Approximation using a Neural Network')
plt.legend()
plt.grid(True)
plt.savefig("section1_sine_approximation.png", dpi=300)
plt.close()
# =============================================================================
# Section 3: Comparison between Shallow and Deep Networks
# =============================================================================
def demo_shallow_vs_deep():
"""
Compares shallow vs. deep neural network architectures on approximating the sine function.
This demonstration sets up two models:
- A shallow network (one hidden layer)
- A deep network (multiple hidden layers)
Both networks are trained on the same sine function dataset, and their predictions are compared.
Detailed inline comments explain the impact of depth versus width.
"""
# Generate dataset: 1000 points in [0, 2*pi]
N = 1000
X = np.linspace(0, 2*np.pi, N).reshape(-1, 1)
y = np.sin(X)
# Convert to torch tensors
X_tensor = torch.from_numpy(X).float()
y_tensor = torch.from_numpy(y).float()
# Create DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Define models: Shallow (1 hidden layer) vs. Deep (4 hidden layers)
shallow_model = SimpleNeuralNet(input_dim=1, hidden_dims=[50], output_dim=1, activation=nn.Tanh)
deep_model = SimpleNeuralNet(input_dim=1, hidden_dims=[25, 25, 25, 25], output_dim=1, activation=nn.Tanh)
# Training parameters
criterion = nn.MSELoss()
optimizer_shallow = optim.Adam(shallow_model.parameters(), lr=0.01)
optimizer_deep = optim.Adam(deep_model.parameters(), lr=0.01)
# Train both models
train_model(shallow_model, dataloader, criterion, optimizer_shallow, n_epochs=500, verbose=False)
train_model(deep_model, dataloader, criterion, optimizer_deep, n_epochs=500, verbose=False)
# Predictions for both models
shallow_model.eval()
deep_model.eval()
with torch.no_grad():
y_pred_shallow = shallow_model(X_tensor).detach().numpy()
y_pred_deep = deep_model(X_tensor).detach().numpy()
# Plot the predictions from both models alongside the true sine function
plt.figure(figsize=(10, 6))
plt.plot(X, y, label=r"True $\sin(x)$", color='blue')
plt.plot(X, y_pred_shallow, label="Shallow NN (1 hidden layer)", color='red', linestyle='--')
plt.plot(X, y_pred_deep, label="Deep NN (4 hidden layers)", color='green', linestyle='-.')
plt.xlabel(r'$x$')
plt.ylabel(r'Output')
plt.title(r'Comparison of Shallow vs. Deep Neural Networks for $\sin(x)$ Approximation')
plt.legend()
plt.grid(True)
plt.savefig("section3_shallow_vs_deep.png", dpi=300)
plt.close()
# =============================================================================
# Section 4: Demonstration of Overfitting with a Noisy Dataset
# =============================================================================
def demo_overfitting():
"""
Demonstrates overfitting in neural networks by training on a small, noisy dataset.
The example creates a noisy version of the sine function with limited data points.
A neural network with excessive capacity is trained, leading to overfitting.
The training and validation losses over epochs are plotted to illustrate the overfitting trend.
This simulation serves as a practical example of computational complexity and optimization challenges.
"""
# Generate a small dataset: 30 points in [0, 2*pi]
N = 30
X_train = np.linspace(0, 2*np.pi, N).reshape(-1, 1)
y_train = np.sin(X_train) + np.random.normal(scale=0.3, size=(N, 1)) # added noise
# Create a separate validation dataset with more points
N_val = 100
X_val = np.linspace(0, 2*np.pi, N_val).reshape(-1, 1)
y_val = np.sin(X_val) + np.random.normal(scale=0.3, size=(N_val, 1))
# Convert to torch tensors
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float()
X_val_tensor = torch.from_numpy(X_val).float()
y_val_tensor = torch.from_numpy(y_val).float()
# Create DataLoaders for training and validation
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
# Define an over-parameterized model (likely to overfit given small training data)
overfit_model = SimpleNeuralNet(input_dim=1, hidden_dims=[100, 100, 100], output_dim=1, activation=nn.Tanh)
criterion = nn.MSELoss()
optimizer = optim.Adam(overfit_model.parameters(), lr=0.01)
# Train the model while tracking training and validation loss
n_epochs = 600
train_losses, val_losses = train_model(overfit_model, train_loader, criterion, optimizer,
n_epochs=n_epochs, verbose=False, val_loader=val_loader)
# Plot training vs. validation loss over epochs
epochs = np.arange(1, n_epochs+1)
plt.figure(figsize=(8, 5))
plt.plot(epochs, train_losses, label="Training Loss", color='red')
plt.plot(epochs, val_losses, label="Validation Loss", color='blue')
plt.xlabel(r'Epoch')
plt.ylabel(r'Loss (MSE)')
plt.title(r'Overfitting Demonstration: Training vs. Validation Loss')
plt.legend()
plt.grid(True)
plt.savefig("section4_overfitting.png", dpi=300)
plt.close()
# =============================================================================
# Main Function: Execute Demonstrations
# =============================================================================
def main():
"""
Main function to execute all demonstration examples for the lecture.
"""
print("Running Section 1: Sine Approximation...")
demo_sine_approximation()
print("Section 1 plot saved as 'section1_sine_approximation.png'\n")
print("Running Section 3: Shallow vs. Deep Network Comparison...")
demo_shallow_vs_deep()
print("Section 3 plot saved as 'section3_shallow_vs_deep.png'\n")
print("Running Section 4: Overfitting Demonstration...")
demo_overfitting()
print("Section 4 plot saved as 'section4_overfitting.png'\n")
if __name__ == "__main__":
main()
-------------------------------------------------------------------------------- Calling: executor... >>>>>>>> EXECUTING CODE BLOCK (inferred language is python)... Message from executor: execution succeeded Code output: Running Section 1: Sine Approximation... Epoch 100/500, Train Loss: 0.0003 Epoch 200/500, Train Loss: 0.0003 Epoch 300/500, Train Loss: 0.0023 Epoch 400/500, Train Loss: 0.0001 Epoch 500/500, Train Loss: 0.0002 Section 1 plot saved as 'section1_sine_approximation.png' Running Section 3: Shallow vs. Deep Network Comparison... Section 3 plot saved as 'section3_shallow_vs_deep.png' Running Section 4: Overfitting Demonstration... Section 4 plot saved as 'section4_overfitting.png' -------------------------------------------------------------------------------- Calling: course_material_provider... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | course_material_provider | 0.02917 | 11506 | 41 | 11547 |
Message from course_material_provider: ***** Suggested tool call (call_2vNljcBhF9nwsweFo4uhXiNq): record_plot_paths ***** Arguments: {"plot_paths":["section1_sine_approximation.png","section3_shallow_vs_deep.png","section4_overfitting.png"]} ********************************************************************************** -------------------------------------------------------------------------------- Calling: _Swarm_Tool_Executor... >>>>>>>> EXECUTING FUNCTION record_plot_paths... Call ID: call_2vNljcBhF9nwsweFo4uhXiNq Input arguments: {'plot_paths': ['section1_sine_approximation.png', 'section3_shallow_vs_deep.png', 'section4_overfitting.png'], 'context_variables': {'feedback_left': 0, 'plot_paths': None}} Message from _Swarm_Tool_Executor: ***** Response from calling tool (call_2vNljcBhF9nwsweFo4uhXiNq) ***** Plot paths recorded: ['section1_sine_approximation.png', 'section3_shallow_vs_deep.png', 'section4_overfitting.png'] ********************************************************************** -------------------------------------------------------------------------------- Calling: course_material_provider... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | course_material_provider | 0.02938 | 11587 | 41 | 11628 |
Message from course_material_provider: ***** Suggested tool call (call_58gAA60VL8i1Sar7rugUviYY): show_plots ***** Arguments: {"plot_paths":["section1_sine_approximation.png","section3_shallow_vs_deep.png","section4_overfitting.png"]} *************************************************************************** -------------------------------------------------------------------------------- Calling: _Swarm_Tool_Executor... >>>>>>>> EXECUTING FUNCTION show_plots... Call ID: call_58gAA60VL8i1Sar7rugUviYY Input arguments: {'plot_paths': ['section1_sine_approximation.png', 'section3_shallow_vs_deep.png', 'section4_overfitting.png'], 'context_variables': {'feedback_left': 0, 'plot_paths': ['section1_sine_approximation.png', 'section3_shallow_vs_deep.png', 'section4_overfitting.png']}} using ['section1_sine_approximation.png', 'section3_shallow_vs_deep.png', 'section4_overfitting.png'] from context.
Message from _Swarm_Tool_Executor: ***** Response from calling tool (call_58gAA60VL8i1Sar7rugUviYY) ***** Displayed plot from ['section1_sine_approximation.png', 'section3_shallow_vs_deep.png', 'section4_overfitting.png'] ********************************************************************** -------------------------------------------------------------------------------- Calling: course_material_provider... >>>>>>>> USING AUTO REPLY...
Model | agent | Cost | Prompt Tokens | Completion Tokens | Total Tokens |
---|---|---|---|---|---|
gpt-4o-2024-08-06 | course_material_provider | 0.03047 | 11667 | 130 | 11797 |
Message from course_material_provider:
The plots for the lecture on the Universal Approximation Theorem have been successfully generated and displayed. They include:
- Sine Function Approximation: Demonstrates the approximation of the sine function using a simple neural network.
- Shallow vs. Deep Networks: Compares the performance of shallow and deep neural networks in approximating the sine function.
- Overfitting Demonstration: Illustrates overfitting by training a neural network on a noisy dataset, showing training vs. validation loss.
These visualizations will aid in understanding the practical implications of the Universal Approximation Theorem in neural network design.
--------------------------------------------------------------------------------
Calling: admin...