import numpy as np
import matplotlib.pyplot as plt


# Function to generate synthetic data
def generate_data(n_samples, n_features):
    """Build a reproducible synthetic linear-regression dataset.

    The global NumPy RNG is re-seeded with 0 on every call, so the same
    arguments always produce the same arrays.

    Args:
        n_samples: Number of rows in the full design matrix.
        n_features: Number of columns, i.e. the dimension of the model.

    Returns:
        A tuple ``(theta_true, X, y, X_subset, y_subset)``:
        ``theta_true`` is the (n_features, 1) standard-normal ground truth,
        ``X`` is (n_samples, n_features) with entries uniform in [0, 2),
        ``y`` is ``X.dot(theta_true)`` plus standard-normal noise, and
        ``X_subset`` / ``y_subset`` are the rows left after removing
        ``int(0.7 * n_samples)`` randomly chosen rows (original order kept).
    """
    np.random.seed(0)

    # The draw order below fixes the generated values; do not reorder.
    true_weights = np.random.randn(n_features, 1)
    features = 2 * np.random.rand(n_samples, n_features)
    noise = np.random.randn(n_samples, 1)
    targets = np.dot(features, true_weights) + noise

    # Pick the rows to discard, then keep everything else via a boolean mask.
    n_dropped = int(0.7 * n_samples)
    dropped_rows = np.random.choice(n_samples, size=n_dropped, replace=False)
    keep = np.ones(n_samples, dtype=bool)
    keep[dropped_rows] = False

    return true_weights, features, targets, features[keep], targets[keep]


# Define gradient descent function for linear regression
def gradient_descent(X, y, learning_rate=0.01, n_iterations=10000):
    """Fit a linear model by full-batch gradient descent on the MSE.

    The parameter vector starts from a standard-normal draw taken from the
    global NumPy RNG, so results depend on the RNG state at call time.

    Args:
        X: Design matrix of shape (n, d).
        y: Targets of shape (n, 1).
        learning_rate: Step size applied to every gradient.
        n_iterations: Number of update steps to perform.

    Returns:
        A tuple ``(thetas, losses)`` of lists with one entry per iteration:
        the parameter vector after the update, and the mean squared error
        evaluated at that updated vector.
    """
    n_rows, n_cols = X.shape
    weights = np.random.randn(n_cols, 1)
    theta_history, loss_history = [], []

    for _ in range(n_iterations):
        # Gradient of mean((X w - y)^2) with respect to w.
        residual = np.dot(X, weights) - y
        gradient = (2 / n_rows) * np.dot(X.T, residual)

        # Rebind (rather than update in place) so stored iterates stay distinct.
        weights = weights - learning_rate * gradient
        theta_history.append(weights)

        # The loss is recorded at the freshly updated parameters.
        updated_residual = np.dot(X, weights) - y
        loss_history.append(np.mean(updated_residual ** 2))

    return theta_history, loss_history


# Generate synthetic data
n_features = 75
n_samples = 100

# Use n_samples (not a repeated literal) so the dataset size has a single
# source of truth for the code that follows.
theta_true, X, y, X_subset, y_subset = generate_data(n_samples, n_features)


# Train one model on the full data set ("rich") and one on the reduced
# subset ("poor"); each call returns the per-iteration parameters and losses.
thetas_rich, losses_rich = gradient_descent(X, y)
thetas_poor, losses_poor = gradient_descent(X_subset, y_subset)


# Plot the loss over the first 500 iterations for the data-rich regime
plt.figure(figsize=(10, 5))
# The curve carries a label so that plt.legend() has a handle to display;
# without one matplotlib only emits a "No handles with labels" warning.
plt.plot(losses_rich[0:500], label='Data-Rich Regime', color='blue')
plt.xlabel('Iteration')
plt.ylabel('Mean Squared Error (Loss)')
plt.title('Loss Convergence under 100 Samples (Data-Rich Regime)')
plt.legend()
plt.grid(True)
plt.show()


# Plot the loss over the first 500 iterations for the data-poor regime
plt.figure(figsize=(10, 5))
# The curve carries a label so that plt.legend() has a handle to display;
# without one matplotlib only emits a "No handles with labels" warning.
plt.plot(losses_poor[0:500], label='Data-Poor Regime', color='red')
plt.xlabel('Iteration')
plt.ylabel('Mean Squared Error (Loss)')
plt.title('Loss Convergence under 30 Samples (Data-Poor Regime)')
plt.legend()
plt.grid(True)
plt.show()


# Function to calculate squared distance
def squared_distances(thetas,theta_opt):
    """Return the squared norm of ``theta - theta_opt`` for each iterate.

    Args:
        thetas: Iterable of parameter arrays.
        theta_opt: Reference array subtracted from every element of
            ``thetas``.

    Returns:
        A list with one value per element of ``thetas``, in the same order,
        each computed as ``np.linalg.norm(theta - theta_opt) ** 2``.
    """
    result = []
    for candidate in thetas:
        gap = np.linalg.norm(candidate - theta_opt)
        result.append(gap ** 2)
    return result


# Squared distance from every gradient-descent iterate to the ground-truth
# parameter vector, for both training sets.
distances_true_rich = squared_distances(thetas_rich, theta_true)
distances_true_poor = squared_distances(thetas_poor, theta_true)


# Compare both regimes over the first 5000 iterations; the flat green line
# marks the squared norm of theta_true as a reference level.
plt.figure(figsize=(10, 5))
plt.plot(distances_true_rich[:5000], color='blue', label='Data-Rich Regime')
plt.plot(distances_true_poor[:5000], color='red', label='Data-Poor Regime')
plt.plot(
    range(5000),
    [np.linalg.norm(theta_true) ** 2] * 5000,
    color='green',
    label='Squared Norm of the True Model',
)
plt.xlabel('Iteration')
plt.ylabel('Squared Distance to the True Model')
plt.title('Convergence to the True Model')
plt.legend()
plt.grid(True)
plt.show()


# Least-squares solutions obtained through the Moore-Penrose pseudo-inverse,
# one per training set.
theta_opt_rich = np.linalg.pinv(X).dot(y)
# The data-poor optimum must come from the subset that thetas_poor was
# trained on; computing it from (X, y) would merely duplicate theta_opt_rich.
theta_opt_poor = np.linalg.pinv(X_subset).dot(y_subset)


# Squared distance of every iterate to the optimum of its own training set.
distances_opt_rich = squared_distances(thetas_rich,theta_opt_rich)
distances_opt_poor = squared_distances(thetas_poor,theta_opt_poor)


# Data-poor regime: squared distance of the iterates to theta_opt_poor over
# the first 5000 iterations, with the squared norm of theta_opt_poor drawn
# as a flat reference line.
plt.figure(figsize=(10, 5))
plt.plot(distances_opt_poor[:5000], color='red', label='Data-Poor Regime')
plt.plot(
    range(5000),
    [np.linalg.norm(theta_opt_poor) ** 2] * 5000,
    color='purple',
    label='Squared Norm of the Optimal Model',
)
plt.xlabel('Iteration')
plt.ylabel('Squared Distance to the Optimal Model')
plt.title('Convergence to the Optimal Model (Data-Poor Regime)')
plt.legend()
plt.grid(True)
plt.show()


# Data-rich regime: squared distance of the iterates to theta_opt_rich over
# the first 5000 iterations, with the squared norm of theta_opt_rich drawn
# as a flat reference line.
plt.figure(figsize=(10, 5))
plt.plot(distances_opt_rich[:5000], color='blue', label='Data-Rich Regime')
plt.plot(
    range(5000),
    [np.linalg.norm(theta_opt_rich) ** 2] * 5000,
    color='purple',
    label='Squared Norm of the Optimal Model',
)
plt.xlabel('Iteration')
plt.ylabel('Squared Distance to the Optimal Model')
plt.title('Convergence to the Optimal Model (Data-Rich Regime)')
plt.legend()
plt.grid(True)
plt.show()


# Define gradient descent function for linear regression with L2 regularization
def gradient_descent_with_regularization(X, y, learning_rate=0.01, l2_penalty=0.01, n_iterations=10000):
    """Fit a linear model by gradient descent on MSE plus an L2 penalty.

    The objective is ``mean((X theta - y)^2) + l2_penalty * sum(theta^2)``.
    The parameter vector starts from a standard-normal draw taken from the
    global NumPy RNG, so results depend on the RNG state at call time.

    Args:
        X: Design matrix of shape (n, d).
        y: Targets of shape (n, 1).
        learning_rate: Step size applied to every gradient.
        l2_penalty: Weight of the squared-norm penalty on the parameters.
        n_iterations: Number of update steps to perform.

    Returns:
        A tuple ``(thetas, losses)`` of lists with one entry per iteration:
        the parameter vector after the update, and the penalised objective
        (MSE plus L2 term) evaluated at that updated vector.
    """
    n_rows, n_cols = X.shape
    weights = np.random.randn(n_cols, 1)
    theta_history, loss_history = [], []

    for _ in range(n_iterations):
        # Data-fit gradient plus the gradient of l2_penalty * ||w||^2.
        residual = np.dot(X, weights) - y
        gradient = (2 / n_rows) * np.dot(X.T, residual) + (2 * l2_penalty) * weights

        # Rebind (rather than update in place) so stored iterates stay distinct.
        weights = weights - learning_rate * gradient
        theta_history.append(weights)

        # Record the full penalised objective at the updated parameters.
        data_term = np.mean((np.dot(X, weights) - y) ** 2)
        penalty_term = l2_penalty * np.sum(weights ** 2)
        loss_history.append(data_term + penalty_term)

    return theta_history, loss_history


l2_penalty=0.01
# Closed-form minimiser of mean((X theta - y)^2) + l2_penalty * ||theta||^2
# on the data-poor subset: (X^T X + n * l2_penalty * I) theta = X^T y, where
# n is the number of rows actually used for training (the subset, not the
# full data set). Solving the linear system avoids forming an explicit inverse.
theta_opt_poor_l2 = np.linalg.solve(
    X_subset.T.dot(X_subset) + X_subset.shape[0] * l2_penalty * np.eye(n_features),
    X_subset.T.dot(y_subset),
)


# Train the L2-regularised model on the data-poor subset. The penalty is
# passed explicitly so the run uses the module-level l2_penalty value rather
# than silently relying on the function default being equal to it.
thetas_poor_l2, losses_poor_l2 = gradient_descent_with_regularization(
    X_subset, y_subset, l2_penalty=l2_penalty
)


# Squared distance of every regularised iterate to the ground-truth parameters.
distances_true_poor_l2 = squared_distances(thetas_poor_l2,theta_true)


# Data-poor regime, distance to the true model: regularised (orange) versus
# unregularised (red) iterates, with the squared norm of theta_true drawn as
# a flat reference line across 10000 iterations.
plt.figure(figsize=(10, 5))
plt.plot(
    distances_true_poor_l2,
    color='orange',
    label='Data-Poor Regime with Regularization',
)
plt.plot(distances_true_poor, color='red', label='Data-Poor Regime')
plt.plot(
    range(10000),
    [np.linalg.norm(theta_true) ** 2] * 10000,
    color='green',
    label='Squared Norm of the True Model',
)
plt.xlabel('Iteration')
plt.ylabel('Squared Distance to the True Model')
plt.title('Convergence to the True Model (Data-Poor Regime)')
plt.legend()
plt.grid(True)
plt.show()


# Squared distance of every regularised iterate to theta_opt_poor.
# NOTE(review): this curve is measured against theta_opt_poor (the
# pseudo-inverse solution), while the purple reference line below uses the
# norm of theta_opt_poor_l2 - confirm which optimum is intended for both.
distances_opt_poor_l2 = squared_distances(thetas_poor_l2, theta_opt_poor)


# Data-poor regime, distance to the optimal model: regularised (orange)
# versus unregularised (red) iterates, plus a flat reference line across
# 10000 iterations.
plt.figure(figsize=(10, 5))
plt.plot(
    distances_opt_poor_l2,
    color='orange',
    label='Data-Poor Regime with Regularization',
)
plt.plot(distances_opt_poor, color='red', label='Data-Poor Regime')
plt.plot(
    range(10000),
    [np.linalg.norm(theta_opt_poor_l2) ** 2] * 10000,
    color='purple',
    label='Squared Norm of the Optimal Model',
)
plt.xlabel('Iteration')
plt.ylabel('Squared Distance to the Optimal Model')
plt.title('Convergence to the Optimal Model (Data-Poor Regime)')
plt.legend()
plt.grid(True)
plt.show()