diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9b3664f --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# Built Python extension dropped next to qengine/__init__.py for local dev +/qengine/*.so +/qengine/*.dylib +/qengine/__pycache__/ + +/skbuild-build/ + +/build/ +/.idea/ +**/__pycache__/ +/docs/html/ +/docs/latex/ + +# Local reference tree (optional clone) +/CPP-design-pattern-derivatives-pricing/ + +# Local environment and secrets +.env +.env.* +!.env.example + +# Local tooling caches +/.pycache/ +/.mplconfig/ diff --git a/README.md b/README.md index d1bb550..896381a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,79 @@ -# pricing +# option_pricing -Monte Carlo pricing of European options under Black–Scholes +C++/Python quantitative finance engine for option pricing, implied-volatility analysis, and market-data ingestion. -### Project structure +## What is included + +- `cpp/`: core C++ pricing library (Monte Carlo + Black-Scholes closed form), DB ingestion hooks, and pybind bindings. +- `qengine/`: Python package exposing the native extension (`import qengine`). +- `src/ImpliedVolatility/`: SVI calibration and implied-volatility tooling. +- `src/data/`: data ingestion, SQL schema, and analytics helpers. +- `tests/`: C++ unit tests (GoogleTest). +- `scripts/`: operational scripts, including PostgreSQL setup. +- `docs/`: Doxygen configuration and generated API docs (ignored in git for publication). + +## Quickstart + +### 1) Clone and create a Python environment + +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install --upgrade pip +pip install -e . +pip install pandas yfinance sqlalchemy psycopg2-binary matplotlib scipy +``` + +### 2) Configure environment variables + +```bash +cp .env.example .env +``` + +Then edit `.env` with your local database credentials. 
+ +### 3) Create database and schema + +Use the idempotent setup script: + +```bash +source .env +python scripts/setup_postgres.py +``` + +This script creates/updates: +- database role (`DB_USER`) +- database (`DB_NAME`) +- tables/indexes from `src/data/sql/schema.sql` + +### 4) Build C++ extension and run tests + +```bash +cmake -S . -B build +cmake --build build -j +ctest --test-dir build --output-on-failure +``` + +### 5) Run Yahoo options ingestion + +```bash +source .env +python src/data/ingestion/ingest_yahoo_options.py +``` + +`PIPELINE_SYMBOLS` in `.env` controls which symbols are ingested (comma-separated, e.g. `SPY,AAPL,QQQ`). + +## Security and publication notes + +- No credentials are stored in source code. +- `.env` files are git-ignored; only `.env.example` is committed. +- Before publishing, rotate any credentials that were ever committed in the past. +- Prefer least-privilege DB users for runtime ingestion jobs. + +## Generating C++ API docs + +```bash +cmake --build build --target docs +``` + +Generated output goes to `docs/html/` and is ignored in version control. diff --git a/docs/SECURITY.md b/docs/SECURITY.md new file mode 100644 index 0000000..ec71ad6 --- /dev/null +++ b/docs/SECURITY.md @@ -0,0 +1,27 @@ +# Security Checklist + +## Secrets handling + +- Never commit `.env` or any file containing credentials. +- Use `.env.example` for non-sensitive defaults only. +- Set DB credentials through environment variables. +- Rotate credentials if they have ever appeared in git history. + +## Database hardening + +- Use a dedicated runtime user with least required privileges. +- Keep administrative users separate from ingestion users. +- Restrict DB network access to trusted hosts/VPC/private network. +- Enable SSL/TLS for non-local database connections. + +## Publication readiness + +Before making the repository public: + +1. Confirm `git status` has no secret files staged. +2. 
Search for potential secret patterns: + - passwords + - API keys + - tokens +3. Verify `.gitignore` includes local secret files (`.env*`). +4. Regenerate credentials used during development. diff --git a/docs/SETUP.md b/docs/SETUP.md new file mode 100644 index 0000000..01f347a --- /dev/null +++ b/docs/SETUP.md @@ -0,0 +1,60 @@ +# Setup Guide + +This guide describes a clean local setup for development and reproducible runs. + +## Prerequisites + +- Python 3.10+ +- CMake 3.16+ +- A C++20 compiler +- PostgreSQL 14+ (or Docker) +- On macOS, Homebrew packages for C++ DB support: + - `libpq` + - `libpqxx` + - `eigen` + - `pybind11` + +## Python dependencies + +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install --upgrade pip +pip install -e . +pip install pandas yfinance sqlalchemy psycopg2-binary matplotlib scipy +``` + +## Environment configuration + +```bash +cp .env.example .env +``` + +Edit `.env` and set: + +- `DB_HOST`, `DB_PORT`, `DB_NAME`, `DB_USER`, `DB_PASSWORD` +- `PIPELINE_SYMBOLS` +- admin credentials used only by setup script (`POSTGRES_ADMIN_*`) + +## Database bootstrap + +```bash +source .env +python scripts/setup_postgres.py +``` + +The script is idempotent and safe to rerun. + +## Build and test C++ + +```bash +cmake -S . -B build +cmake --build build -j +ctest --test-dir build --output-on-failure +``` + +## Generate Doxygen docs + +```bash +cmake --build build --target docs +``` diff --git a/standalone_numerical_experiments/local_volatility_instability/INDEPENDENT_STANDALONE.txt b/standalone_numerical_experiments/local_volatility_instability/INDEPENDENT_STANDALONE.txt new file mode 100644 index 0000000..58fc565 --- /dev/null +++ b/standalone_numerical_experiments/local_volatility_instability/INDEPENDENT_STANDALONE.txt @@ -0,0 +1,6 @@ +This folder is intentionally self-contained. + +- No imports from the parent option_pricing package (no qengine, src/, cpp bindings). 
+- Third-party dependencies: numpy, matplotlib (see requirements.txt). +- Run: python run_experiment.py [--out lv_rmse.png] +- Safe to copy elsewhere or run in isolation. diff --git a/standalone_numerical_experiments/local_volatility_instability/figures/lv_relerr.png b/standalone_numerical_experiments/local_volatility_instability/figures/lv_relerr.png new file mode 100644 index 0000000..9ef14f3 Binary files /dev/null and b/standalone_numerical_experiments/local_volatility_instability/figures/lv_relerr.png differ diff --git a/standalone_numerical_experiments/local_volatility_instability/figures/lv_rmse.png b/standalone_numerical_experiments/local_volatility_instability/figures/lv_rmse.png new file mode 100644 index 0000000..e617d33 Binary files /dev/null and b/standalone_numerical_experiments/local_volatility_instability/figures/lv_rmse.png differ diff --git a/standalone_numerical_experiments/local_volatility_instability/figures/lv_sigma2.png b/standalone_numerical_experiments/local_volatility_instability/figures/lv_sigma2.png new file mode 100644 index 0000000..e525355 Binary files /dev/null and b/standalone_numerical_experiments/local_volatility_instability/figures/lv_sigma2.png differ diff --git a/standalone_numerical_experiments/local_volatility_instability/gatheral_local_vol.py b/standalone_numerical_experiments/local_volatility_instability/gatheral_local_vol.py new file mode 100644 index 0000000..19db847 --- /dev/null +++ b/standalone_numerical_experiments/local_volatility_instability/gatheral_local_vol.py @@ -0,0 +1,108 @@ +""" +Gatheral local variance in total-variance / log-moneyness form (practitioner's guide). + +sigma^2 = (d_T w) / ( 1 - (y/w) d_y w + + (1/4)(-1/4 - 1/w + y^2/w^2) (d_y w)^2 + + (1/2) d_yy w ) + +where w = omega is total implied variance, y is log-moneyness (convention as in the note). 
def local_variance_from_derivatives(
    y: np.ndarray,
    w: np.ndarray,
    dy_w: np.ndarray,
    dyy_w: np.ndarray,
    dT_w: np.ndarray,
    *,
    eps: float = 1e-14,
) -> np.ndarray:
    """Evaluate the Gatheral local-variance formula element-wise.

    The value computed at each point is::

        sigma^2 = d_T w / ( 1 - (y/w) d_y w
                            + (1/4)(-1/4 - 1/w + y^2/w^2) (d_y w)^2
                            + (1/2) d_yy w )

    Args:
        y: Log-moneyness values.
        w: Total implied variance at ``y``.
        dy_w: First derivative of ``w`` with respect to ``y``.
        dyy_w: Second derivative of ``w`` with respect to ``y``.
        dT_w: Derivative of ``w`` with respect to maturity.
        eps: Threshold below which ``|w|`` or ``|denominator|`` is treated
            as singular.

    Returns:
        A float array with the broadcast shape of the inputs. Entries are
        ``nan`` wherever ``w``, ``dy_w``, ``dyy_w`` or ``dT_w`` is not finite,
        ``|w| <= eps``, or the denominator is ``nan`` or has magnitude
        ``<= eps``.
    """
    # Broadcasting lets callers pass scalars (e.g. a constant d_yy w) next to
    # grid arrays; same-shape inputs behave exactly as a plain element-wise
    # evaluation.
    y, w, dy_w, dyy_w, dT_w = np.broadcast_arrays(
        *(np.asarray(a, dtype=float) for a in (y, w, dy_w, dyy_w, dT_w))
    )

    usable = (
        np.isfinite(w)
        & (np.abs(w) > eps)
        & np.isfinite(dy_w)
        & np.isfinite(dyy_w)
        & np.isfinite(dT_w)
    )

    # Evaluate everywhere and mask afterwards: rejected points may divide by
    # zero or overflow, so floating-point warnings are silenced here and the
    # affected entries are replaced by nan below.
    with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
        denom = (
            1.0
            - (y / w) * dy_w
            + 0.25 * (-0.25 - 1.0 / w + (y ** 2) / (w ** 2)) * (dy_w ** 2)
            + 0.5 * dyy_w
        )
        ratio = dT_w / denom

    # A nan denominator compares False here, so it is rejected as well.
    usable = usable & (np.abs(denom) > eps)
    return np.where(usable, ratio, np.nan)


def quadratic_total_variance(
    y: np.ndarray,
    alpha: float,
    beta: float,
    gamma: float,
    T: float,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Quadratic total variance ``w(y, T) = T * (alpha + beta*y + gamma*y^2)``.

    Args:
        y: Log-moneyness values.
        alpha: Constant coefficient of the quadratic.
        beta: Linear coefficient of the quadratic.
        gamma: Quadratic coefficient.
        T: Maturity multiplying the quadratic.

    Returns:
        The 4-tuple ``(w, dT_w, dy_w, dyy_w)`` with::

            dT_w  = alpha + beta*y + gamma*y^2
            dy_w  = T * (beta + 2*gamma*y)
            dyy_w = 2*gamma*T   (constant, shaped like ``y``)

        Note the order: ``dT_w`` comes second, before the ``y`` derivatives.
    """
    y = np.asarray(y, dtype=float)
    f = alpha + beta * y + gamma * y ** 2
    w = T * f
    dT_w = f
    dy_w = T * (beta + 2.0 * gamma * y)
    dyy_w = np.full_like(y, 2.0 * gamma * T)
    return w, dT_w, dy_w, dyy_w


def analytic_local_variance_quadratic(
    y: np.ndarray,
    alpha: float,
    beta: float,
    gamma: float,
    T: float,
) -> np.ndarray:
    """Local variance of the quadratic surface using its exact derivatives.

    Feeds the closed-form derivatives from ``quadratic_total_variance`` into
    ``local_variance_from_derivatives``; singular points come back as ``nan``.
    """
    y = np.asarray(y, dtype=float)
    w, dT_w, dy_w, dyy_w = quadratic_total_variance(y, alpha, beta, gamma, T)
    # The two helpers order their derivative arguments differently, so pass
    # them explicitly in the order the formula function expects.
    return local_variance_from_derivatives(y, w, dy_w, dyy_w, dT_w)


def central_first_derivative_uniform(w: np.ndarray, h: float) -> np.ndarray:
    """Central difference ``(w[i+1] - w[i-1]) / (2h)`` on a uniform grid.

    Differences are taken along the first axis. The first and last entries
    have no two-sided neighbours and are returned as ``nan``.
    """
    w = np.asarray(w, dtype=float)
    out = np.full_like(w, np.nan)
    out[1:-1] = (w[2:] - w[:-2]) / (2.0 * h)
    return out


def second_derivative_uniform(w: np.ndarray, h: float) -> np.ndarray:
    """Second difference ``(w[i+1] - 2 w[i] + w[i-1]) / h^2`` on a uniform grid.

    Differences are taken along the first axis. The first and last entries
    are returned as ``nan``.
    """
    w = np.asarray(w, dtype=float)
    out = np.full_like(w, np.nan)
    out[1:-1] = (w[2:] - 2.0 * w[1:-1] + w[:-2]) / (h ** 2)
    return out


def add_multiplicative_noise(
    w: np.ndarray,
    sigma_noise: float,
    rng: np.random.Generator,
) -> np.ndarray:
    """Return ``w * (1 + eps)`` with ``eps ~ N(0, sigma_noise^2)`` per element.

    One independent normal draw is taken from ``rng`` for every entry of
    ``w``; the input array is not modified.
    """
    w = np.asarray(w, dtype=float)
    eps = rng.normal(0.0, sigma_noise, size=w.shape)
    return w * (1.0 + eps)
# ---------------------------------------------------------------------------
# Defaults (quadratic total variance, positive w on y ∈ [-0.5, 0.5])
# ---------------------------------------------------------------------------

ALPHA = 0.04
BETA = 0.0
GAMMA = 0.1
T_MATURITY = 1.0
Y_MIN = -0.5
Y_MAX = 0.5
N_GRID = 201

# Accepted treatments of d_T w; shared by the CLI and local_variance_one_draw.
_DT_MODES = ("exact", "noisy_ratio")


def ensure_parent_dir(path: str) -> None:
    """Create the directory that will contain ``path`` if it does not exist."""
    parent = os.path.dirname(os.path.abspath(path))
    if parent:
        os.makedirs(parent, exist_ok=True)


def log_uniform_sigma_grid(n_points: int, sigma_min: float, sigma_max: float) -> np.ndarray:
    """
    Return σ_noise values with log₁₀(σ) equally spaced on [sigma_min, sigma_max].

    This is the correct sampling for a log–log RMSE plot; it is not
    linspace(σ_min, σ_max).

    Note: `n_points` is clamped to a minimum of 4, so requests for fewer
    points still return 4 values.

    Raises:
        ValueError: unless 0 < sigma_min <= sigma_max.
    """
    n_points = max(4, n_points)
    if sigma_min <= 0 or sigma_max <= 0 or sigma_max < sigma_min:
        raise ValueError("Require 0 < sigma_min <= sigma_max.")
    return np.logspace(np.log10(sigma_min), np.log10(sigma_max), n_points)


def relative_pointwise_error(
    sigma2_analytic: np.ndarray, sigma2_fd: np.ndarray, eps: float = 1e-12
) -> np.ndarray:
    """Signed relative error (σ²_FD − σ²_analytic) / max(|σ²_analytic|, eps).

    The `eps` floor keeps the ratio finite where the analytic value is
    (numerically) zero.
    """
    # Coerce so that plain sequences work as well as arrays.
    reference = np.asarray(sigma2_analytic, dtype=float)
    estimate = np.asarray(sigma2_fd, dtype=float)
    return (estimate - reference) / np.maximum(np.abs(reference), eps)


def rmse_absolute(
    sigma2_analytic: np.ndarray,
    sigma2_fd: np.ndarray,
    interior: slice,
) -> float:
    """RMSE of (σ²_FD − σ²_analytic) on interior indices.

    Points where either value is non-finite are ignored; returns nan when no
    point remains.
    """
    sa = np.asarray(sigma2_analytic, dtype=float)[interior]
    sf = np.asarray(sigma2_fd, dtype=float)[interior]
    m = np.isfinite(sa) & np.isfinite(sf)
    if not np.any(m):
        return float("nan")
    d = sf[m] - sa[m]
    return float(np.sqrt(np.mean(d * d)))


def rmse_relative(
    sigma2_analytic: np.ndarray,
    sigma2_fd: np.ndarray,
    interior: slice,
    eps: float = 1e-12,
) -> float:
    """RMSE over interior grid points of the relative error.

    The relative error is (σ²_FD − σ²_analytic) / max(|σ²_analytic|, eps).
    Non-finite entries are ignored; returns nan when no point remains.
    """
    re = relative_pointwise_error(sigma2_analytic, sigma2_fd, eps=eps)[interior]
    m = np.isfinite(re)
    if not np.any(m):
        return float("nan")
    return float(np.sqrt(np.mean(re[m] ** 2)))


def local_variance_one_draw(
    y: np.ndarray,
    h: float,
    alpha: float,
    beta: float,
    gamma: float,
    T: float,
    sigma_noise: float,
    rng: np.random.Generator,
    dT_mode: Literal["exact", "noisy_ratio"],
) -> tuple[np.ndarray, np.ndarray]:
    """One noisy surface and FD local variance; returns (σ²_analytic, σ²_FD).

    The quadratic total variance is perturbed multiplicatively, its y
    derivatives are taken by finite differences with spacing `h`, and the
    result is pushed through the Gatheral formula.

    `dT_mode` selects ∂_T w: "exact" uses the noise-free derivative,
    "noisy_ratio" uses w̃ / T.

    Raises:
        ValueError: if `dT_mode` is not one of the supported modes.
    """
    # Validate before drawing noise so that a bad argument does not advance
    # the caller's random stream.
    if dT_mode not in _DT_MODES:
        raise ValueError(f"Unknown dT_mode {dT_mode!r}; expected one of {_DT_MODES}.")

    w_true, dT_w_true, _, _ = quadratic_total_variance(y, alpha, beta, gamma, T)
    sigma2_a = analytic_local_variance_quadratic(y, alpha, beta, gamma, T)

    w_tilde = add_multiplicative_noise(w_true, sigma_noise, rng)
    dy = central_first_derivative_uniform(w_tilde, h)
    dyy = second_derivative_uniform(w_tilde, h)

    if dT_mode == "exact":
        dT = dT_w_true
    else:  # "noisy_ratio"
        dT = w_tilde / T

    sigma2_fd = local_variance_from_derivatives(y, w_tilde, dy, dyy, dT)
    return sigma2_a, sigma2_fd


def _nanmean_or_nan(values: list) -> float:
    """Mean ignoring NaNs; nan (without a RuntimeWarning) if all are NaN."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0 or np.isnan(arr).all():
        return float("nan")
    return float(np.nanmean(arr))


def rmse_curves_averaged(
    y: np.ndarray,
    h: float,
    alpha: float,
    beta: float,
    gamma: float,
    T: float,
    sigma_grid: np.ndarray,
    rng: np.random.Generator,
    dT_mode: Literal["exact", "noisy_ratio"],
    interior: slice,
    trials_per_sigma: int,
) -> tuple[np.ndarray, np.ndarray]:
    """
    For each σ in `sigma_grid`, average RMSE (relative and absolute) over
    `trials_per_sigma` independent noise draws.

    `trials_per_sigma` is clamped to at least 1. Trials whose RMSE is nan are
    left out of the average; a σ for which every trial is nan yields nan.

    Returns:
        (relative RMSE, absolute RMSE), one entry per σ.
    """
    rel: list[float] = []
    abs_: list[float] = []
    n_trials = max(1, trials_per_sigma)

    for sig in sigma_grid:
        rel_trials: list[float] = []
        abs_trials: list[float] = []
        for _ in range(n_trials):
            sa, sf = local_variance_one_draw(
                y, h, alpha, beta, gamma, T, float(sig), rng, dT_mode
            )
            rel_trials.append(rmse_relative(sa, sf, interior))
            abs_trials.append(rmse_absolute(sa, sf, interior))
        rel.append(_nanmean_or_nan(rel_trials))
        abs_.append(_nanmean_or_nan(abs_trials))

    return np.asarray(rel, dtype=float), np.asarray(abs_, dtype=float)


def plot_rmse_vs_noise(
    sigma_grid: np.ndarray,
    rmse_rel: np.ndarray,
    rmse_abs: np.ndarray,
    *,
    h: float,
    T: float,
    dT_mode: str,
    trials_per_sigma: int,
) -> "mpl.figure.Figure":
    """
    Log–log plot: RMSE (relative and absolute in σ²) vs σ_noise, reference y = σ.

    Only strictly positive σ values are drawn. `h`, `T`, `dT_mode` and
    `trials_per_sigma` are used for the title only.

    Raises:
        ValueError: if `sigma_grid` has no strictly positive entry.
    """
    x = np.asarray(sigma_grid, dtype=float)
    rel = np.asarray(rmse_rel, dtype=float)
    abs_err = np.asarray(rmse_abs, dtype=float)

    pos = x > 0
    if not np.any(pos):
        # Fail before a figure is created instead of on an empty reduction.
        raise ValueError("sigma_grid must contain at least one positive value for a log-log plot.")

    fig, ax = plt.subplots(figsize=(5.8, 3.8), constrained_layout=True)

    n = len(x)
    ms = 3.5 if n > 50 else 4.5  # smaller markers for dense grids

    # NOTE(review): the "nat" subscript in the legend presumably denotes the
    # analytic reference value — confirm the intended label.
    ax.loglog(
        x[pos],
        rel[pos],
        "o-",
        ms=ms,
        lw=1.25,
        label=r"RMSE of relative error $(\sigma^2_{\mathrm{FD}}-\sigma^2_{\mathrm{nat}})/|\sigma^2_{\mathrm{nat}}|$",
        zorder=3,
    )
    ax.loglog(
        x[pos],
        abs_err[pos],
        "s--",
        ms=ms - 1,
        lw=1.0,
        alpha=0.9,
        label=r"RMSE of $\sigma^2$ error $|\sigma^2_{\mathrm{FD}}-\sigma^2_{\mathrm{nat}}|$",
        zorder=2,
    )

    # Slope-1 guide line spanning the plotted σ range.
    s_lo, s_hi = float(x[pos].min()), float(x[pos].max())
    ax.loglog(
        [s_lo, s_hi],
        [s_lo, s_hi],
        ":",
        color="0.4",
        lw=2.0,
        zorder=1,
        label=r"reference slope 1: $y=\sigma_{\mathrm{noise}}$",
    )

    ax.set_xlabel(r"$\sigma_{\mathrm{noise}}$ (multiplicative noise on $\tilde{w}$)")
    ax.set_ylabel("RMSE (interior $y$)")
    subtitle = f"$T={T}$, $h={h:.4f}$, $\\partial_T w$: {dT_mode}"
    if trials_per_sigma > 1:
        subtitle += f", mean over {trials_per_sigma} draws per $\\sigma$"
    ax.set_title("FD local variance: RMSE vs noise\n" + subtitle, fontsize=10)
    ax.grid(True, which="both", alpha=0.35)
    ax.legend(loc="best", fontsize=8, framealpha=0.95)

    return fig


def configure_matplotlib_style() -> None:
    """Conservative defaults suitable for print."""
    mpl.rcParams.update(
        {
            "figure.dpi": 120,
            "savefig.dpi": 300,
            "font.size": 10,
            "axes.labelsize": 10,
            "axes.titlesize": 10,
            "legend.fontsize": 8,
            "axes.grid": True,
        }
    )


def main() -> None:
    """Parse CLI options, run the noise sweep and write the RMSE figure."""
    configure_matplotlib_style()

    parser = argparse.ArgumentParser(
        description="RMSE of finite-difference local variance vs multiplicative noise (single figure).",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--seed", type=int, default=42, help="RNG seed.")
    parser.add_argument(
        "--out",
        type=str,
        default="lv_rmse.png",
        help="Output image path.",
    )
    parser.add_argument(
        "--dT-mode",
        choices=_DT_MODES,
        default="exact",
        help="Treatment of ∂_T w when w is replaced by noisy w̃ on the grid.",
    )
    parser.add_argument("--rmse-points", type=int, default=35, help="Number of σ_noise values (log-uniform).")
    parser.add_argument("--rmse-sigma-min", type=float, default=1e-5, help="Smallest σ_noise.")
    parser.add_argument("--rmse-sigma-max", type=float, default=5e-4, help="Largest σ_noise.")
    parser.add_argument(
        "--rmse-trials",
        type=int,
        default=50,
        help="Independent noisy surfaces per σ_noise; RMSE is averaged.",
    )
    args = parser.parse_args()

    rng = np.random.default_rng(args.seed)
    y = np.linspace(Y_MIN, Y_MAX, N_GRID)
    h = float(y[1] - y[0])
    # The finite-difference helpers return nan at both endpoints, so the
    # statistics are restricted to interior grid points.
    interior = slice(1, -1)

    sigma_grid = log_uniform_sigma_grid(args.rmse_points, args.rmse_sigma_min, args.rmse_sigma_max)
    rmse_rel, rmse_abs = rmse_curves_averaged(
        y,
        h,
        ALPHA,
        BETA,
        GAMMA,
        T_MATURITY,
        sigma_grid,
        rng,
        args.dT_mode,
        interior,
        args.rmse_trials,
    )

    fig = plot_rmse_vs_noise(
        sigma_grid,
        rmse_rel,
        rmse_abs,
        h=h,
        T=T_MATURITY,
        dT_mode=args.dT_mode,
        trials_per_sigma=args.rmse_trials,
    )

    try:
        ensure_parent_dir(args.out)
        fig.savefig(args.out, bbox_inches="tight")
        print(f"Wrote {args.out}")
    finally:
        # Release the figure even if writing the file fails.
        plt.close(fig)


if __name__ == "__main__":
    main()