# Source code for starclass.features.freqextr

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract frequencies from timeseries.

.. codeauthor:: Kristine Kousholt Mikkelsen <201505068@post.au.dk>
.. codeauthor:: Rasmus Handberg <rasmush@phys.au.dk>
"""

import numpy as np
import logging
import itertools
from bottleneck import median
from scipy.stats import binned_statistic
from scipy.interpolate import interp1d
import astropy.units as u
from astropy.table import Table
from .powerspectrum import powerspectrum

#--------------------------------------------------------------------------------------------------
# TODO: Replace with ps.ls.model?

[docs]
def model(x, a, b, freq):
omegax = 0.1728 * np.pi * freq * x # Strange factor is 2 * 86400 * 1e-6
return a * np.sin(omegax) + b * np.cos(omegax)

#--------------------------------------------------------------------------------------------------

[docs]
def freqextr(lightcurve, n_peaks=6, n_harmonics=0, hifac=1, ofac=4, snrlim=None, snr_width=None,
faplim=1-0.9973, devlim=0.5, conseclim=10, harmonics_list=None, Noptimize=10, optim_max_diff=10,
initps=None):
r"""
Extract frequencies from timeseries.

The program will perform iterative sine-wave fitting (CLEAN or pre-whitening) using
a sum of harmonic functions of the following form:

.. math::
\sum_{i=1}^{N_\mathrm{peaks}} A_i \sin(2\pi\nu_i t + \delta_i)
= \sum_{i=1}^{N_\mathrm{peaks}} \alpha_i\sin(2\pi\nu_i t) + \beta_i\cos(2\pi\nu_i t) \, ,

where :math:\nu_i, :math:A_i and :math:\delta_i denoted the frequency, amplitude
and phase of the oscillation.

If n_harmonic is greater than zero, the routine will additionally for each extracted peak
extract peaks at the given number of harmonics for each peak. Default is to :math:2\nu_i,
:math:3\nu_i etc., but this can be controlled by the harmonics_list input.

At each iteration, an optimization loop is entered which will go back and re-optimize the
previously found peaks in an attempt at minimizing influences between close frequencies.
The details of this optimization can be controlled by the parameters Noptimize and
optim_max_diff.

Parameters:
lightcurve (:class:lightkurve.LightCurve): Lightcurve to extract frequencies for.
n_peaks (int, optional): Number of frequencies to extract.
n_harmonics (int, optional): Number of harmonics to extract for each frequency.
hifac (float, optional): Nyquist factor.
ofac (int, optional): Oversampling factor used for initial search for peaks
in power spectrum.
snrlim (float, optional): Limit on local signal-to-noise ratio above which peaks are
considered significant. If set to None no limit is enforced. Default is to not
enforce a limit.
snr_width (float, optional): Width in uHz around each peak to estimate signal-to-noise from.
Default is 15 frequency steps on either side of the peak.
faplim (float, optional): False alarm probability limit. Peaks with a f.a.p. below this
limit are considerd significant. If set to None no limit is enforced.
Default is 1-0.9973=0.0027.
devlim (float, optional): If set to None no limit is enforced. Default is 50%.
conseclim (int, optional): Stop after this number of consecutive failed peaks.
Default is 10.
Noptimize (int, optional): At each iteration re-optimize. If put to -1, all peaks will
be optimized at each iteration. Default is 10.
optim_max_diff (float, optional): Maximal difference in uHz between frequencies to be
optimized. Any frequencies futher away than this value from the extracted peak
will not be optimized in that iteration. If set to None no limit is enforced.
Default is 10 uHz. Please note that this does not take the spectral windowfunction
into account, so this value may have to	be increased in cases where the windowfunction
has significant side-lobes.
initps (:class:powerspectrum, optional): Initial powerspectrum. Should be a powerspectrum
calculated from the provided lightcurve. This can be provided if the powerspectrum
has already been calculated. If not provided, it is calculated from the provided
lightcurve.

Returns:
:class:astropy.table.Table: Table of extracted oscillations.

Note:
If the height of the peak of one of the harmonics are close to being insignificant,
the harmonic may not be found as an harmonic, but will be found later as a peak in it self.

.. codeauthor:: Kristine Kousholt Mikkelsen <201505068@post.au.dk>
.. codeauthor:: Rasmus Handberg <rasmush@phys.au.dk>
"""

logger = logging.getLogger(__name__)

# Default value for different parameters
# TODO: Add these as inputs
estimate_noise = True

if initps is not None and not isinstance(initps, powerspectrum):
raise ValueError("Initial powerspectrum is invalid")

if Noptimize is None:
Noptimize = 0

# If no list of harmonics is given, do the simple one:
if harmonics_list is None:
harmonics_list = np.arange(2, n_harmonics+2)
elif len(harmonics_list) < n_harmonics:
raise ValueError("List of harmonics is too short")

# Constants:
power_median_to_mean = (1 - 1/9)**-3
mean_noise = 1

# Store original lightcurve and powerspectrum for later use:
original_lightcurve = lightcurve.copy()
if initps is None:
original_ps = powerspectrum(original_lightcurve)
else:
original_ps = initps
f_max = original_ps.nyquist*hifac*1e6
df = original_ps.df*1e6

# Defaults that depend on the power spectrum parameters:
if snr_width is None:
snr_width = 15*df

# Create lists for frequencies, alpha, beta and deviations:
# Create as 2D array, which as main-frequency number for rows, and harmonic number for columns.
nu = np.full((n_peaks, n_harmonics+1), np.nan)
alpha = np.full((n_peaks, n_harmonics+1), np.nan)
beta = np.full((n_peaks, n_harmonics+1), np.nan)
deviation = np.full((n_peaks, n_harmonics+1), np.nan)

# The first powerspectrum has already been calculated:
ps = original_ps.copy()

for i in range(n_peaks):
logger.debug("-"*72)

# Calculate the powerspectrum and find the index of the largest power value
if i > 0:
ps = powerspectrum(lightcurve)
frequency, power = ps.powerspectrum(oversampling=ofac, nyquist_factor=hifac, scale='power')

# Estimate a frequency-dependent noise-floor by binning the power spectrum.
if estimate_noise:
# Create bins to estimate noise level in:
#bins = np.logspace(np.floor(np.log10(df)), np.ceil(np.log10(f_max)), 20)
bins = np.linspace(df, f_max, 20)

# Calculate the median in the bins.
# Make sure we have at least 20 frequencies in each bin,
# otherwise combine adjacent bins until this is the case:
for _ in range(100):
mean_noise, bins, binindx = binned_statistic(frequency, power, bins=bins, statistic=median)

redo = False
for k, num in enumerate(np.bincount(binindx)):
if num < 20:
bins = np.delete(bins, k)
redo = True
break

if not redo:
break

bins = bins[:-1] + 0.5*(bins[1:] - bins[:-1])

indx = np.isfinite(mean_noise)
if np.sum(indx) > 2:
mean_noise_func = interp1d(bins[indx], mean_noise[indx], kind='linear', fill_value='extrapolate', assume_sorted=True)
mean_noise = power_median_to_mean * mean_noise_func(frequency)
mean_noise = np.clip(mean_noise, 0, None)
mean_noise += 1 # Add one to avoid DivideByZero errors - only used for finding max
else:
mean_noise = 1

#plt.figure()
#plt.plot(frequency, np.sqrt(power/mean_noise), 'k-', lw=0.5)
#plt.plot(frequency, power, 'b')
#plt.plot(frequency, mean_noise,'k-')
#plt.title(i)
#plt.show()

# Finds the frequency of the largest peak:
pmax_index = np.argmax(power / mean_noise)
fsearch = frequency[pmax_index]
if pmax_index > 0 and pmax_index < len(power)-1:
fsearch = [frequency[pmax_index-1], fsearch, frequency[pmax_index+1]]
nu[i,0] = ps.optimize_peak(fsearch)
alpha[i,0], beta[i,0] = ps.alpha_beta(nu[i,0])
logger.debug('Fundamental frequency: %f', nu[i,0])

# Stop if significance becomes too low (lombscargle significance)
if faplim is not None:
FAP = ps.false_alarm_probability(nu[i,0])
if FAP > faplim:
logger.debug("Stopped from FAP")
nu[i,0] = np.nan
alpha[i,0] = np.nan
beta[i,0] = np.nan
break

# Stop if significance becomes too low (SNR ratio)
if snrlim is not None:
# Calculate SNR by estimating noise level locally around peak:
# TODO: Subtract peak first?
noise = np.sqrt(power_median_to_mean * median(power[(frequency > (nu[i,0] - snr_width)) & (frequency < (nu[i,0] + snr_width))]))
amp = np.sqrt(alpha[i,0]**2 + beta[i,0]**2)
snr = amp / noise
logger.debug("SNR: %f", snr)

#plt.figure()
#plt.plot(frequency, np.sqrt(power), 'k-', lw=0.5)
#plt.plot(frequency, np.sqrt(mean_noise), 'r-', lw=0.5)
#plt.plot(frequency[pmax_index], np.sqrt(power[pmax_index]), 'go')
#plt.plot(nu[i,0], ps.powerspectrum(nu[i,0]*1e-6, scale='amplitude'), 'ro')
#plt.axhline(noise)
#plt.axvline(nu[i,0] - snr_width)
#plt.axvline(nu[i,0] + snr_width)

if snr < snrlim:
logger.debug("Stopped from SNR")
nu[i,0] = np.nan
alpha[i,0] = np.nan
beta[i,0] = np.nan
break

# Check how the extracted peak compares with the original powerspectrum
if devlim is not None:
atemp, btemp = original_ps.alpha_beta(nu[i,0])
deviation[i,0] = (alpha[i,0]**2 + beta[i,0]**2) / (atemp**2 + btemp**2)

# Stops if there are to many consecutive failed peaks
if devlim is not None and conseclim is not None:
# Stop numpy from warning us that deviation contains NaN
with np.errstate(invalid='ignore'):
deviation_large = (deviation > 1/devlim) | (deviation < devlim)
if np.all( deviation_large[max(i-conseclim, 0):(i+1), 0] ): # Only checking main peaks right now!
logger.debug('Stopped due to too many consecutive failed peaks')
break

# Removes the largest peak from the data:
lightcurve -= model(lightcurve.time, alpha[i,0], beta[i,0], nu[i,0])

# Loop through all harmonics:
for h in range(1, n_harmonics+1):
n_harmonic = harmonics_list[h-1]
# Don't find harmonics outside frequency range:
if n_harmonic*nu[i,0] > f_max:
break

# Updates the flux and optimize to find the correct frequency
ps = powerspectrum(lightcurve)

# Checks the significance of the harmonics. If it is too low NaN is returned in amplitude, frequency and phase for the given harmonic
nu[i,h] = ps.optimize_peak(n_harmonic*nu[i,0])

# Stop if significance becomes too low (lombscargle significance)
if faplim is not None:
FAP = ps.false_alarm_probability(nu[i,h])
logger.debug('harmonic %d: %f %f', h, nu[i,h], FAP)
if FAP > faplim:
logger.debug("Harmonic rejected from FAP")
nu[i,h] = np.nan
alpha[i,h] = np.nan
beta[i,h] = np.nan
continue

# Stop if significance becomes too low (SNR ratio):
if snrlim is not None:
# Calculate SNR by estimating noise level locally around peak:
# TODO: Subtract peak first?
noise = np.sqrt(power_median_to_mean * median(power[(frequency > (nu[i,0] - snr_width)) & (frequency < (nu[i,0] + snr_width))]))
amp = np.sqrt(alpha[i,0]**2 + beta[i,0]**2)
snr = amp / noise
logger.debug("SNR: %f", snr)

#plt.figure()
#plt.plot(frequency, np.sqrt(power), 'k-', lw=0.5)
#plt.plot(frequency, np.sqrt(mean_noise), 'r-', lw=0.5)
#plt.plot(frequency[pmax_index], np.sqrt(power[pmax_index]), 'go')
#plt.plot(nu[i,0], ps.powerspectrum(nu[i,0]*1e-6, scale='amplitude'), 'ro')
#plt.axhline(noise)
#plt.axvline(nu[i,0] - snr_width)
#plt.axvline(nu[i,0] + snr_width)

if snr < snrlim:
logger.debug("Stopped from SNR")
nu[i,0] = np.nan
alpha[i,0] = np.nan
beta[i,0] = np.nan
break

# Removes the harmonic peak from the data:
alpha[i,h], beta[i,h] = ps.alpha_beta(nu[i,h])
lightcurve -= model(lightcurve.time, alpha[i,h], beta[i,h], nu[i,h])

# Check how the extracted peak compares with the original powerspectrum and stops
# if there are to many consecutive failed peaks
if devlim is not None:
atemp, btemp = original_ps.alpha_beta(nu[i,h])
deviation[i,h] = (alpha[i,h]**2 + beta[i,h]**2) / (atemp**2 + btemp**2)

# Optimize the Noptimize nearest peaks
if i != 0 and Noptimize != 0:
for h in range(n_harmonics+1):

# Sort to find nearest frequencies to optimize
Nopt = Noptimize + 1
if (i+1)*(n_harmonics+1) < Nopt or Noptimize == -1:
Nopt = (i+1)*(n_harmonics+1)

nusort = np.abs(nu - nu[i,h])
nusort = nusort.ravel()
order = np.argsort(nusort) # sort nusort and find the list of indexes

# Create an index of which peaks should be optimized:
indx_optim = np.zeros_like(order, dtype='bool')
indx_optim[1:Nopt] = True

# Only optimize a peak if it is closer than the set limit.
# NOTE: Be careful as this doesn't take the window function into account
if optim_max_diff is not None:
with np.errstate(invalid='ignore'):
indx_optim &= (nusort[order] < optim_max_diff)

# Pick out the peaks that should be optimized:
order = order[indx_optim]
order = list(zip(*np.unravel_index(order, (n_peaks, n_harmonics+1))))
logger.debug("Optimizing %d peaks", len(order))

for j in order:
if np.isfinite(alpha[j]): # and deviation[j] < 1/devlim and deviation[j] > devlim:
lightcurve += model(lightcurve.time, alpha[j], beta[j], nu[j])
ps = powerspectrum(lightcurve)

# Find the frequency of maximum power and find alpha and beta again
nu[j] = ps.optimize_peak(nu[j])
alpha[j], beta[j] = ps.alpha_beta(nu[j])

# Recalculate the deviation
if devlim is not None:
atemp, btemp = original_ps.alpha_beta(nu[j])
deviation[j] = (alpha[j]**2 + beta[j]**2)/(atemp**2 + btemp**2)

# Remove the oscillation again:
lightcurve -= model(lightcurve.time, alpha[j], beta[j], nu[j])

# Remove anything that in the end was marked with a large deviation:
if devlim is not None:
for i in range(n_peaks):
if deviation[i,0] > 1/devlim or deviation[i,0] < devlim:
# If main peak is rejected, then also reject all harmonics
nu[i,:] = np.nan
alpha[i,:] = np.nan
beta[i,:] = np.nan
else:
for j in range(1, n_harmonics+1):
if deviation[i,j] > 1/devlim or deviation[i,j] < devlim:
nu[i,j] = np.nan
alpha[i,j] = np.nan
beta[i,j] = np.nan

# Calculate amplitude and phase from alpha and beta:
amp = np.sqrt(alpha**2 + beta**2)
phase = np.arctan2(beta, alpha)

# Make sure the found peaks are ordered by the amplitude of the main peak:
amp[np.isnan(amp)] = -np.inf
indx = np.argsort(amp[:,0])[::-1]
nu = nu[indx,:]
amp = amp[indx,:]
phase = phase[indx,:]
alpha = alpha[indx,:]
beta = beta[indx,:]
deviation = deviation[indx,:]
amp[~np.isfinite(amp)] = np.nan

# Gather into table:
num, harmonic = np.meshgrid(range(1, n_peaks+1), range(n_harmonics+1))
tab = Table(
data=[
num.flatten(order='F'),
harmonic.flatten(order='F'),
nu.flatten(),
amp.flatten(),
phase.flatten(),
alpha.flatten(),
beta.flatten(),
deviation.flatten()
],
names=['num', 'harmonic', 'frequency', 'amplitude', 'phase', 'alpha', 'beta', 'deviation'],
dtype=['int32', 'int32', 'float64', 'float64', 'float64', 'float64', 'float64', 'float64'])

tab['frequency'].unit = u.uHz
tab['amplitude'].unit = lightcurve.flux_unit
tab['alpha'].unit = lightcurve.flux_unit
tab['beta'].unit = lightcurve.flux_unit

# Add index to peak number and harmonic for easy lookup:
# TODO: Use table indicies - Problem with Pickle

# Add meta data to table on how the list was created:
tab.meta['n_peaks'] = n_peaks
tab.meta['n_harmonics'] = n_harmonics
tab.meta['harmonics_list'] = harmonics_list
tab.meta['hifac'] = hifac
tab.meta['ofac'] = ofac
tab.meta['snrlim'] = snrlim
tab.meta['snr_width'] = snr_width * u.uHz
tab.meta['faplim'] = faplim
tab.meta['devlim'] = devlim
tab.meta['conseclim'] = conseclim

return tab

#--------------------------------------------------------------------------------------------------

[docs]
def freqextr_table_from_dict(feat, n_peaks=None, n_harmonics=None, flux_unit=None):
"""
Reconstruct freqextr table from features dict.

Please note that not all information can be recreated from features dictionaries.
The deviation column will be all NaN, since it can not be recreated, and
only n_peaks and n_harmonics will be available in the meta-information.

Parameters:
feat (dict): Dictionary of features.
n_peaks (int): If not provided, it will be determined from feat.
n_harmonics (int): If not provided, it will be determined from feat.
flux_unit (:class:astropy.units.Unit): Unit to put on the amplitude, alpha and beta columns.

Returns:
:class:astropy.table.Table:

.. codeauthor:: Rasmus Handberg <rasmush@phys.au.dk>
"""

if n_peaks is None:
n_peaks = 0
while True:
n_peaks += 1
if 'freq{0:d}'.format(n_peaks) not in feat:
n_peaks -= 1
break

if n_harmonics is None:
n_harmonics = 0
while True:
n_harmonics += 1
if 'freq1_harmonic{0:d}'.format(n_harmonics) not in feat:
n_harmonics -= 1
break

rows = []
for num, harmonic in itertools.product(range(1, n_peaks+1), range(n_harmonics+1)):
if harmonic == 0:
key = '{0:d}'.format(num)
else:
key = '{0:d}_harmonic{1:d}'.format(num, harmonic)

amp = feat.get('amp' + key, np.NaN)
freq = feat.get('freq' + key, np.NaN)
phase = feat.get('phase' + key, np.NaN)
rows.append([
num,
harmonic,
freq,
amp,
phase,
amp*np.cos(phase),
amp*np.sin(phase),
np.NaN # There is no way to recover this information
])

tab = Table(
rows=rows,
names=['num', 'harmonic', 'frequency', 'amplitude', 'phase', 'alpha', 'beta', 'deviation'],
dtype=['int32', 'int32', 'float64', 'float64', 'float64', 'float64', 'float64', 'float64'])

tab['frequency'].unit = u.uHz
tab['amplitude'].unit = flux_unit
tab['alpha'].unit = flux_unit
tab['beta'].unit = flux_unit

# Add meta data to table on how the list was created:
tab.meta['n_peaks'] = n_peaks
tab.meta['n_harmonics'] = n_harmonics
#tab.meta['harmonics_list'] = harmonics_list
#tab.meta['hifac'] = hifac
#tab.meta['ofac'] = ofac
#tab.meta['snrlim'] = snrlim
#tab.meta['snr_width'] = snr_width * u.uHz
#tab.meta['faplim'] = faplim
#tab.meta['devlim'] = devlim
#tab.meta['conseclim'] = conseclim

return tab

#--------------------------------------------------------------------------------------------------

[docs]
def freqextr_table_to_dict(tab):
"""
Convert freqextr output table to features dictionary.

This will return a dictionary with freq, amp, and phase keys.
Please not that this operation does not conserve all information from the table.

Parameters:
tab (:class:astropy.table.Table): Table extracted by :func:freqextr.

Returns:
dict: Dictionary with frequencies, amplitudes and phases.

.. codeauthor:: Rasmus Handberg <rasmush@phys.au.dk>
"""
features = {}
for row in tab:
if row['harmonic'] == 0:
key = '{0:d}'.format(row['num'])
else:
key = '{0:d}_harmonic{1:d}'.format(row['num'], row['harmonic'])

features['freq' + key] = row['frequency']
features['amp' + key] = row['amplitude']
features['phase' + key] = row['phase']

return features