In [1]:
from scipy import stats
In [2]:
help(stats.linregress)
Help on function linregress in module scipy.stats._stats_mstats_common:

linregress(x, y=None)
    Calculate a linear least-squares regression for two sets of measurements.
    
    Parameters
    ----------
    x, y : array_like
        Two sets of measurements.  Both arrays should have the same length.  If
        only `x` is given (and ``y=None``), then it must be a two-dimensional
        array where one dimension has length 2.  The two sets of measurements
        are then found by splitting the array along the length-2 dimension.  In
        the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is
        equivalent to ``linregress(x[0], x[1])``.
    
    Returns
    -------
    slope : float
        Slope of the regression line.
    intercept : float
        Intercept of the regression line.
    rvalue : float
        Correlation coefficient.
    pvalue : float
        Two-sided p-value for a hypothesis test whose null hypothesis is
        that the slope is zero, using Wald Test with t-distribution of
        the test statistic.
    stderr : float
        Standard error of the estimated gradient.
    
    See also
    --------
    :func:`scipy.optimize.curve_fit` : Use non-linear
     least squares to fit a function to data.
    :func:`scipy.optimize.leastsq` : Minimize the sum of
     squares of a set of equations.
    
    Notes
    -----
    Missing values are considered pair-wise: if a value is missing in `x`,
    the corresponding value in `y` is masked.
    
    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> from scipy import stats
    
    Generate some data:
    
    >>> np.random.seed(12345678)
    >>> x = np.random.random(10)
    >>> y = 1.6*x + np.random.random(10)
    
    Perform the linear regression:
    
    >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    >>> print("slope: %f    intercept: %f" % (slope, intercept))
    slope: 1.944864    intercept: 0.268578
    
    To get coefficient of determination (R-squared):
    
    >>> print("R-squared: %f" % r_value**2)
    R-squared: 0.735498
    
    Plot the data along with the fitted line:
    
    >>> plt.plot(x, y, 'o', label='original data')
    >>> plt.plot(x, intercept + slope*x, 'r', label='fitted line')
    >>> plt.legend()
    >>> plt.show()
    
    Example for the case where only x is provided as a 2x2 array:
    
    >>> x = np.array([[0, 1], [0, 2]])
    >>> r = stats.linregress(x)
    >>> r.slope, r.intercept
    (2.0, 0.0)

In [3]:
import pandas as pd
In [7]:
import yfinance as yf
In [8]:
spy_etf = yf.download('SPY')
[*********************100%***********************]  1 of 1 completed
In [9]:
spy_etf.head()
Out[9]:
Open High Low Close Adj Close Volume
Date
1993-01-29 43.96875 43.96875 43.75000 43.93750 26.184059 1003200
1993-02-01 43.96875 44.25000 43.96875 44.25000 26.370279 480500
1993-02-02 44.21875 44.37500 44.12500 44.34375 26.426161 201300
1993-02-03 44.40625 44.84375 44.37500 44.81250 26.705482 529400
1993-02-04 44.96875 45.09375 44.46875 45.00000 26.817228 531500
In [10]:
spy_etf.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6956 entries, 1993-01-29 to 2020-09-11
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       6956 non-null   float64
 1   High       6956 non-null   float64
 2   Low        6956 non-null   float64
 3   Close      6956 non-null   float64
 4   Adj Close  6956 non-null   float64
 5   Volume     6956 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 380.4 KB
In [23]:
spy_etf = spy_etf.loc["2010-01-01":].copy()
In [14]:
spy_etf.tail()
Out[14]:
Open High Low Close Adj Close Volume
Date
2020-09-04 346.130005 347.829987 334.869995 342.570007 342.570007 139156300
2020-09-08 336.709991 342.640015 332.880005 333.209991 333.209991 114465300
2020-09-09 337.549988 342.459991 336.609985 339.790009 339.790009 91462300
2020-09-10 341.820007 342.529999 332.850006 333.890015 333.890015 90569500
2020-09-11 335.820007 336.970001 331.000000 334.059998 334.059998 84628600
In [16]:
start = pd.to_datetime("2010-01-01")
end = pd.to_datetime("2020-09-11")
In [17]:
aapl = yf.download("aapl", start = start, end = end)
[*********************100%***********************]  1 of 1 completed
In [19]:
import matplotlib.pyplot as plt
%matplotlib inline
In [20]:
aapl.head()
Out[20]:
Open High Low Close Adj Close Volume
Date
2009-12-31 7.611786 7.619643 7.520000 7.526072 6.503574 352410800
2010-01-04 7.622500 7.660714 7.585000 7.643214 6.604801 493729600
2010-01-05 7.664286 7.699643 7.616071 7.656428 6.616219 601904800
2010-01-06 7.656428 7.686786 7.526786 7.534643 6.510980 552160000
2010-01-07 7.562500 7.571429 7.466072 7.520714 6.498945 477131200
In [21]:
aapl["Close"].plot(label = "aapl", figsize = (12,8))
spy_etf["Close"].plot(label = "SPY Index")
plt.legend()
Out[21]:
<matplotlib.legend.Legend at 0x164935e1910>
In [24]:
aapl["Cummulative"] = aapl["Close"]/ aapl["Close"].iloc[0]
spy_etf["Cummulative"] = spy_etf["Close"]/ spy_etf["Close"].iloc[0]
In [25]:
aapl["Cummulative"].plot(label = "aapl", figsize = (12,8))
spy_etf["Cummulative"].plot(label = "SPY Index")
plt.legend()
Out[25]:
<matplotlib.legend.Legend at 0x164936b9190>
In [27]:
aapl["Daily Return"] = aapl["Close"].pct_change(1)
spy_etf["Daily Return"] = spy_etf["Close"].pct_change(1)
In [28]:
plt.scatter(aapl["Daily Return"], spy_etf["Daily Return"], alpha = 0.25)
Out[28]:
<matplotlib.collections.PathCollection at 0x1649374dc70>
In [58]:
plt.hist(aapl["Daily Return"], bins = 100)
plt.tight_layout
Out[58]:
<function matplotlib.pyplot.tight_layout(*, pad=1.08, h_pad=None, w_pad=None, rect=None)>
In [37]:
beta, alpha, r_value, p_value, std_error = stats.linregress(aapl["Daily Return"].iloc[1:],
                                                           spy_etf["Daily Return"].iloc[1:])
In [38]:
alpha
Out[38]:
0.0005479343420684555
In [39]:
beta
Out[39]:
-0.07399874716174956
In [40]:
r_value
Out[40]:
-0.11997036058859331
In [41]:
p_value
Out[41]:
4.289474607310342e-10
In [42]:
std_error
Out[42]:
0.011808831238769782
In [43]:
import numpy as np
In [48]:
noise = np.random.normal(0, 0.001, len(spy_etf["Daily Return"].iloc[1:]))
In [49]:
fake_stock = spy_etf["Daily Return"].iloc[1:] + noise
In [50]:
plt.scatter(fake_stock, spy_etf["Daily Return"].iloc[1:], alpha = 0.25)
Out[50]:
<matplotlib.collections.PathCollection at 0x1649459f490>
In [51]:
beta, alpha, r_value, p_value, std_error = stats.linregress(fake_stock,
                                                           spy_etf["Daily Return"].iloc[1:]) 
In [52]:
beta
Out[52]:
0.9933476671209663
In [53]:
alpha
Out[53]:
-8.222065201963407e-06
In [54]:
r_value
Out[54]:
0.9957015435552179
In [ ]: