Commit
·
9485251
1
Parent(s):
0d5a7ab
add methods for each strategy
Browse files- .gitignore +4 -1
- app.py +54 -8
- examples/mpox.csv +299 -0
- models/lstm_forec_40_11_06.pth +3 -0
- requirements.txt +4 -1
- sections/try_it_yourself.html +46 -1
- src/__init__.py +24 -0
- src/anomaly_detection.py +51 -0
- src/outbreak_detection/__init__.py +12 -0
- src/outbreak_detection/arima.py +111 -0
- src/outbreak_detection/iqr.py +64 -0
- src/outbreak_detection/lstm.py +191 -0
- src/outbreak_detection/lstm_model.py +83 -0
- src/plotting/__init__.py +3 -0
- src/plotting/visualization.py +44 -0
- src/utils.py +82 -0
- src/visualization.py +0 -20
.gitignore
CHANGED
@@ -16,4 +16,7 @@ build/
|
|
16 |
|
17 |
# VSCode
|
18 |
.vscode/
|
19 |
-
*.code-workspace
|
|
|
|
|
|
|
|
16 |
|
17 |
# VSCode
|
18 |
.vscode/
|
19 |
+
*.code-workspace
|
20 |
+
|
21 |
+
# Solution template
|
22 |
+
solution_template/
|
app.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
import gradio as gr
|
2 |
-
from src
|
|
|
|
|
|
|
3 |
|
4 |
PROJECT_HTML_PATH = "sections/project_description.html"
|
5 |
WELCOME_HTML_PATH = "sections/welcome_section.html"
|
@@ -26,11 +29,48 @@ with gr.Blocks(css=blocks_css) as demo:
|
|
26 |
gr.Markdown(demo_section_html)
|
27 |
gr.HTML(try_it_yourself_html)
|
28 |
|
29 |
-
file_input = gr.File(
|
|
|
|
|
|
|
|
|
30 |
plot_btn = gr.Button("Plot Time Series")
|
31 |
plot_output = gr.Plot(label="Time Series Plot")
|
32 |
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
analyze_btn = gr.Button("Detect Anomalies")
|
35 |
anomaly_output = gr.Plot(label="Anomaly Detection Results")
|
36 |
|
@@ -40,10 +80,16 @@ with gr.Blocks(css=blocks_css) as demo:
|
|
40 |
outputs=[plot_output]
|
41 |
)
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
demo.launch(show_api=False)
|
|
|
1 |
import gradio as gr
|
2 |
+
from src import plot_time_series
|
3 |
+
from src import detect_anomalies
|
4 |
+
from src.utils import update_controls
|
5 |
+
|
6 |
|
7 |
PROJECT_HTML_PATH = "sections/project_description.html"
|
8 |
WELCOME_HTML_PATH = "sections/welcome_section.html"
|
|
|
29 |
gr.Markdown(demo_section_html)
|
30 |
gr.HTML(try_it_yourself_html)
|
31 |
|
32 |
+
file_input = gr.File(
|
33 |
+
label="Upload Time Series CSV",
|
34 |
+
file_types=[".csv"],
|
35 |
+
value="examples/mpox.csv" # Set default example file
|
36 |
+
)
|
37 |
plot_btn = gr.Button("Plot Time Series")
|
38 |
plot_output = gr.Plot(label="Time Series Plot")
|
39 |
|
40 |
+
with gr.Row():
|
41 |
+
method_input = gr.Dropdown(
|
42 |
+
choices=ANOMALY_METHODS,
|
43 |
+
interactive=True,
|
44 |
+
label="Select Anomaly Detection Method",
|
45 |
+
value="LSTM"
|
46 |
+
)
|
47 |
+
k_input = gr.Slider(
|
48 |
+
minimum=1,
|
49 |
+
maximum=3,
|
50 |
+
step=0.1,
|
51 |
+
label="k",
|
52 |
+
value=1.5
|
53 |
+
)
|
54 |
+
percentile_input = gr.Slider(
|
55 |
+
minimum=0,
|
56 |
+
maximum=100,
|
57 |
+
step=1,
|
58 |
+
label="Percentile",
|
59 |
+
value=95,
|
60 |
+
interactive=True
|
61 |
+
)
|
62 |
+
threshold_method_input = gr.Dropdown(
|
63 |
+
choices=[
|
64 |
+
"IQR on (ground truth - forecast)",
|
65 |
+
"IQR on |ground truth - forecast|",
|
66 |
+
"IQR on |ground truth - forecast|/forecast",
|
67 |
+
"Percentile threshold on absolute loss",
|
68 |
+
"Percentile threshold on raw loss"
|
69 |
+
],
|
70 |
+
label="Threshold Method",
|
71 |
+
value="IQR on (ground truth - forecast)",
|
72 |
+
interactive=True
|
73 |
+
)
|
74 |
analyze_btn = gr.Button("Detect Anomalies")
|
75 |
anomaly_output = gr.Plot(label="Anomaly Detection Results")
|
76 |
|
|
|
80 |
outputs=[plot_output]
|
81 |
)
|
82 |
|
83 |
+
method_input.change(
|
84 |
+
fn=update_controls,
|
85 |
+
inputs=[method_input],
|
86 |
+
outputs=[percentile_input, threshold_method_input]
|
87 |
+
)
|
88 |
+
|
89 |
+
analyze_btn.click(
|
90 |
+
fn=detect_anomalies,
|
91 |
+
inputs=[file_input, method_input, k_input, percentile_input, threshold_method_input],
|
92 |
+
outputs=[anomaly_output]
|
93 |
+
)
|
94 |
|
95 |
demo.launch(show_api=False)
|
examples/mpox.csv
ADDED
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
date,news
|
2 |
+
2022-04-22,15.0
|
3 |
+
2022-04-23,6.0
|
4 |
+
2022-04-24,3.0
|
5 |
+
2022-04-25,8.0
|
6 |
+
2022-04-26,15.0
|
7 |
+
2022-04-27,19.0
|
8 |
+
2022-04-28,16.0
|
9 |
+
2022-04-29,12.0
|
10 |
+
2022-04-30,5.0
|
11 |
+
2022-05-01,5.0
|
12 |
+
2022-05-02,8.0
|
13 |
+
2022-05-03,6.0
|
14 |
+
2022-05-04,11.0
|
15 |
+
2022-05-05,4.0
|
16 |
+
2022-05-06,9.0
|
17 |
+
2022-05-07,8.0
|
18 |
+
2022-05-08,3.0
|
19 |
+
2022-05-09,15.0
|
20 |
+
2022-05-10,18.0
|
21 |
+
2022-05-11,8.0
|
22 |
+
2022-05-12,15.0
|
23 |
+
2022-05-13,11.0
|
24 |
+
2022-05-14,7.0
|
25 |
+
2022-05-15,10.0
|
26 |
+
2022-05-16,12.0
|
27 |
+
2022-05-17,25.0
|
28 |
+
2022-05-18,45.0
|
29 |
+
2022-05-19,163.0
|
30 |
+
2022-05-20,271.0
|
31 |
+
2022-05-21,146.0
|
32 |
+
2022-05-22,152.0
|
33 |
+
2022-05-23,308.0
|
34 |
+
2022-05-24,259.0
|
35 |
+
2022-05-25,174.0
|
36 |
+
2022-05-26,163.0
|
37 |
+
2022-05-27,116.0
|
38 |
+
2022-05-28,64.0
|
39 |
+
2022-05-29,29.0
|
40 |
+
2022-05-30,80.0
|
41 |
+
2022-05-31,75.0
|
42 |
+
2022-06-01,74.0
|
43 |
+
2022-06-02,79.0
|
44 |
+
2022-06-03,76.0
|
45 |
+
2022-06-04,57.0
|
46 |
+
2022-06-05,24.0
|
47 |
+
2022-06-06,62.0
|
48 |
+
2022-06-07,72.0
|
49 |
+
2022-06-08,74.0
|
50 |
+
2022-06-09,75.0
|
51 |
+
2022-06-10,77.0
|
52 |
+
2022-06-11,31.0
|
53 |
+
2022-06-12,18.0
|
54 |
+
2022-06-13,42.0
|
55 |
+
2022-06-14,52.0
|
56 |
+
2022-06-15,94.0
|
57 |
+
2022-06-16,66.0
|
58 |
+
2022-06-17,44.0
|
59 |
+
2022-06-18,15.0
|
60 |
+
2022-06-19,22.0
|
61 |
+
2022-06-20,24.0
|
62 |
+
2022-06-21,30.0
|
63 |
+
2022-06-22,39.0
|
64 |
+
2022-06-23,71.0
|
65 |
+
2022-06-24,50.0
|
66 |
+
2022-06-25,39.0
|
67 |
+
2022-06-26,51.0
|
68 |
+
2022-06-27,36.0
|
69 |
+
2022-06-28,45.0
|
70 |
+
2022-06-29,79.0
|
71 |
+
2022-06-30,57.0
|
72 |
+
2022-07-01,73.0
|
73 |
+
2022-07-02,42.0
|
74 |
+
2022-07-03,13.0
|
75 |
+
2022-07-04,23.0
|
76 |
+
2022-07-05,38.0
|
77 |
+
2022-07-06,47.0
|
78 |
+
2022-07-07,60.0
|
79 |
+
2022-07-08,63.0
|
80 |
+
2022-07-09,53.0
|
81 |
+
2022-07-10,15.0
|
82 |
+
2022-07-11,39.0
|
83 |
+
2022-07-12,62.0
|
84 |
+
2022-07-13,53.0
|
85 |
+
2022-07-14,41.0
|
86 |
+
2022-07-15,72.0
|
87 |
+
2022-07-16,35.0
|
88 |
+
2022-07-17,25.0
|
89 |
+
2022-07-18,56.0
|
90 |
+
2022-07-19,50.0
|
91 |
+
2022-07-20,45.0
|
92 |
+
2022-07-21,88.0
|
93 |
+
2022-07-22,73.0
|
94 |
+
2022-07-23,112.0
|
95 |
+
2022-07-24,128.0
|
96 |
+
2022-07-25,166.0
|
97 |
+
2022-07-26,169.0
|
98 |
+
2022-07-27,173.0
|
99 |
+
2022-07-28,161.0
|
100 |
+
2022-07-29,130.0
|
101 |
+
2022-07-30,136.0
|
102 |
+
2022-07-31,87.0
|
103 |
+
2022-08-01,136.0
|
104 |
+
2022-08-02,189.0
|
105 |
+
2022-08-03,138.0
|
106 |
+
2022-08-04,163.0
|
107 |
+
2022-08-05,167.0
|
108 |
+
2022-08-06,66.0
|
109 |
+
2022-08-07,37.0
|
110 |
+
2022-08-08,49.0
|
111 |
+
2022-08-09,91.0
|
112 |
+
2022-08-10,112.0
|
113 |
+
2022-08-11,92.0
|
114 |
+
2022-08-12,66.0
|
115 |
+
2022-08-13,41.0
|
116 |
+
2022-08-14,16.0
|
117 |
+
2022-08-15,55.0
|
118 |
+
2022-08-16,93.0
|
119 |
+
2022-08-17,90.0
|
120 |
+
2022-08-18,90.0
|
121 |
+
2022-08-19,74.0
|
122 |
+
2022-08-20,43.0
|
123 |
+
2022-08-21,34.0
|
124 |
+
2022-08-22,36.0
|
125 |
+
2022-08-23,75.0
|
126 |
+
2022-08-24,59.0
|
127 |
+
2022-08-25,74.0
|
128 |
+
2022-08-26,60.0
|
129 |
+
2022-08-27,32.0
|
130 |
+
2022-08-28,13.0
|
131 |
+
2022-08-29,19.0
|
132 |
+
2022-08-30,59.0
|
133 |
+
2022-08-31,36.0
|
134 |
+
2022-09-01,36.0
|
135 |
+
2022-09-02,46.0
|
136 |
+
2022-09-03,15.0
|
137 |
+
2022-09-04,7.0
|
138 |
+
2022-09-05,9.0
|
139 |
+
2022-09-06,30.0
|
140 |
+
2022-09-07,37.0
|
141 |
+
2022-09-08,47.0
|
142 |
+
2022-09-09,39.0
|
143 |
+
2022-09-10,10.0
|
144 |
+
2022-09-11,12.0
|
145 |
+
2022-09-12,21.0
|
146 |
+
2022-09-13,24.0
|
147 |
+
2022-09-14,32.0
|
148 |
+
2022-09-15,19.0
|
149 |
+
2022-09-16,27.0
|
150 |
+
2022-09-17,11.0
|
151 |
+
2022-09-18,8.0
|
152 |
+
2022-09-19,34.0
|
153 |
+
2022-09-20,24.0
|
154 |
+
2022-09-21,12.0
|
155 |
+
2022-09-22,18.0
|
156 |
+
2022-09-23,14.0
|
157 |
+
2022-09-24,1.0
|
158 |
+
2022-09-25,3.0
|
159 |
+
2022-09-26,5.0
|
160 |
+
2022-09-27,10.0
|
161 |
+
2022-09-28,13.0
|
162 |
+
2022-09-29,16.0
|
163 |
+
2022-09-30,12.0
|
164 |
+
2022-10-01,9.0
|
165 |
+
2022-10-02,3.0
|
166 |
+
2022-10-03,9.0
|
167 |
+
2022-10-04,9.0
|
168 |
+
2022-10-05,9.0
|
169 |
+
2022-10-06,7.0
|
170 |
+
2022-10-07,3.0
|
171 |
+
2022-10-08,4.0
|
172 |
+
2022-10-09,3.0
|
173 |
+
2022-10-10,3.0
|
174 |
+
2022-10-11,7.0
|
175 |
+
2022-10-12,15.0
|
176 |
+
2022-10-13,22.0
|
177 |
+
2022-10-14,13.0
|
178 |
+
2022-10-15,4.0
|
179 |
+
2022-10-16,2.0
|
180 |
+
2022-10-17,14.0
|
181 |
+
2022-10-18,12.0
|
182 |
+
2022-10-19,10.0
|
183 |
+
2022-10-20,11.0
|
184 |
+
2022-10-21,10.0
|
185 |
+
2022-10-22,7.0
|
186 |
+
2022-10-23,6.0
|
187 |
+
2022-10-24,7.0
|
188 |
+
2022-10-25,4.0
|
189 |
+
2022-10-26,8.0
|
190 |
+
2022-10-27,10.0
|
191 |
+
2022-10-28,15.0
|
192 |
+
2022-10-29,5.0
|
193 |
+
2022-10-30,4.0
|
194 |
+
2022-10-31,15.0
|
195 |
+
2022-11-01,7.0
|
196 |
+
2022-11-02,8.0
|
197 |
+
2022-11-03,6.0
|
198 |
+
2022-11-04,10.0
|
199 |
+
2022-11-05,3.0
|
200 |
+
2022-11-06,3.0
|
201 |
+
2022-11-07,9.0
|
202 |
+
2022-11-08,7.0
|
203 |
+
2022-11-09,3.0
|
204 |
+
2022-11-10,4.0
|
205 |
+
2022-11-11,2.0
|
206 |
+
2022-11-12,3.0
|
207 |
+
2022-11-13,3.0
|
208 |
+
2022-11-14,8.0
|
209 |
+
2022-11-15,7.0
|
210 |
+
2022-11-16,5.0
|
211 |
+
2022-11-17,6.0
|
212 |
+
2022-11-18,2.0
|
213 |
+
2022-11-19,2.0
|
214 |
+
2022-11-20,0.0
|
215 |
+
2022-11-21,0.0
|
216 |
+
2022-11-22,4.0
|
217 |
+
2022-11-23,7.0
|
218 |
+
2022-11-24,11.0
|
219 |
+
2022-11-25,1.0
|
220 |
+
2022-11-26,3.0
|
221 |
+
2022-11-27,0.0
|
222 |
+
2022-11-28,14.0
|
223 |
+
2022-11-29,16.0
|
224 |
+
2022-11-30,4.0
|
225 |
+
2022-12-01,11.0
|
226 |
+
2022-12-02,5.0
|
227 |
+
2022-12-03,2.0
|
228 |
+
2022-12-04,1.0
|
229 |
+
2022-12-05,6.0
|
230 |
+
2022-12-06,6.0
|
231 |
+
2022-12-07,8.0
|
232 |
+
2022-12-08,7.0
|
233 |
+
2022-12-09,7.0
|
234 |
+
2022-12-10,3.0
|
235 |
+
2022-12-11,3.0
|
236 |
+
2022-12-12,4.0
|
237 |
+
2022-12-13,5.0
|
238 |
+
2022-12-14,6.0
|
239 |
+
2022-12-15,2.0
|
240 |
+
2022-12-16,4.0
|
241 |
+
2022-12-17,3.0
|
242 |
+
2022-12-18,2.0
|
243 |
+
2022-12-19,2.0
|
244 |
+
2022-12-20,4.0
|
245 |
+
2022-12-21,12.0
|
246 |
+
2022-12-22,16.0
|
247 |
+
2022-12-23,9.0
|
248 |
+
2022-12-24,4.0
|
249 |
+
2022-12-25,3.0
|
250 |
+
2022-12-26,4.0
|
251 |
+
2022-12-27,4.0
|
252 |
+
2022-12-28,5.0
|
253 |
+
2022-12-29,14.0
|
254 |
+
2022-12-30,13.0
|
255 |
+
2022-12-31,4.0
|
256 |
+
2023-01-01,3.0
|
257 |
+
2023-01-02,2.0
|
258 |
+
2023-01-03,10.0
|
259 |
+
2023-01-04,10.0
|
260 |
+
2023-01-05,19.0
|
261 |
+
2023-01-06,7.0
|
262 |
+
2023-01-07,3.0
|
263 |
+
2023-01-08,10.0
|
264 |
+
2023-01-09,12.0
|
265 |
+
2023-01-10,17.0
|
266 |
+
2023-01-11,14.0
|
267 |
+
2023-01-12,15.0
|
268 |
+
2023-01-13,10.0
|
269 |
+
2023-01-14,15.0
|
270 |
+
2023-01-15,1.0
|
271 |
+
2023-01-16,9.0
|
272 |
+
2023-01-17,6.0
|
273 |
+
2023-01-18,4.0
|
274 |
+
2023-01-19,2.0
|
275 |
+
2023-01-20,4.0
|
276 |
+
2023-01-21,2.0
|
277 |
+
2023-01-22,3.0
|
278 |
+
2023-01-23,1.0
|
279 |
+
2023-01-24,4.0
|
280 |
+
2023-01-25,7.0
|
281 |
+
2023-01-26,5.0
|
282 |
+
2023-01-27,1.0
|
283 |
+
2023-01-28,0.0
|
284 |
+
2023-01-29,2.0
|
285 |
+
2023-01-30,3.0
|
286 |
+
2023-01-31,6.0
|
287 |
+
2023-02-01,8.0
|
288 |
+
2023-02-02,7.0
|
289 |
+
2023-02-03,4.0
|
290 |
+
2023-02-04,3.0
|
291 |
+
2023-02-05,1.0
|
292 |
+
2023-02-06,13.0
|
293 |
+
2023-02-07,4.0
|
294 |
+
2023-02-08,6.0
|
295 |
+
2023-02-09,3.0
|
296 |
+
2023-02-10,7.0
|
297 |
+
2023-02-11,3.0
|
298 |
+
2023-02-12,0.0
|
299 |
+
2023-02-13,3.0
|
models/lstm_forec_40_11_06.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df812ddfa22e1d13ed44b8645a69b5a5c6d00ccd3008975f59dca3b9e3637b5c
|
3 |
+
size 19807
|
requirements.txt
CHANGED
@@ -1,2 +1,5 @@
|
|
1 |
gradio==5.29.0
|
2 |
-
|
|
|
|
|
|
|
|
1 |
gradio==5.29.0
|
2 |
+
torch==2.7.0
|
3 |
+
scikit-learn==1.6.1
|
4 |
+
plotly==6.0.1
|
5 |
+
statsmodels==0.14.4
|
sections/try_it_yourself.html
CHANGED
@@ -4,7 +4,52 @@
|
|
4 |
<li>📈 Upload a CSV file with two columns: dates in the first column and disease mention counts in the second</li>
|
5 |
<li>🎯 Click "Plot Time Series" to visualize your data</li>
|
6 |
<li>🔍 Select an anomaly detection method from the dropdown</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
<li>⚡ Click "Detect Anomalies" to identify unusual patterns in your time series</li>
|
8 |
</ul>
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
</section>
|
|
|
4 |
<li>📈 Upload a CSV file with two columns: dates in the first column and disease mention counts in the second</li>
|
5 |
<li>🎯 Click "Plot Time Series" to visualize your data</li>
|
6 |
<li>🔍 Select an anomaly detection method from the dropdown</li>
|
7 |
+
<li>⚙️ Configure the detection parameters:
|
8 |
+
<ul>
|
9 |
+
<li><strong>For LSTM method:</strong>
|
10 |
+
<ul>
|
11 |
+
<li><em>k</em>: Controls sensitivity (1-3). Higher values mean stricter anomaly detection.</li>
|
12 |
+
<li><em>Percentile</em>: Threshold percentile for anomaly detection (0-100).</li>
|
13 |
+
<li><em>Threshold Method</em>: Choose how to calculate anomaly thresholds:
|
14 |
+
<ul>
|
15 |
+
<li>IQR-based methods: Compare predictions with actual values using different metrics</li>
|
16 |
+
<li>Percentile-based methods: Use statistical thresholds on prediction errors</li>
|
17 |
+
</ul>
|
18 |
+
</li>
|
19 |
+
</ul>
|
20 |
+
</li>
|
21 |
+
<li><strong>For ARIMA method:</strong>
|
22 |
+
<ul>
|
23 |
+
<li><em>k</em>: Sensitivity multiplier for standard deviation-based thresholds (1-3).</li>
|
24 |
+
</ul>
|
25 |
+
</li>
|
26 |
+
<li><strong>For IQR method:</strong>
|
27 |
+
<ul>
|
28 |
+
<li><em>k</em>: IQR multiplier (1-3). Higher values detect more extreme outliers.</li>
|
29 |
+
</ul>
|
30 |
+
</li>
|
31 |
+
</ul>
|
32 |
+
</li>
|
33 |
<li>⚡ Click "Detect Anomalies" to identify unusual patterns in your time series</li>
|
34 |
</ul>
|
35 |
+
|
36 |
+
<div class="example-section">
|
37 |
+
<h3>📋 Example Dataset</h3>
|
38 |
+
<p>Try out the tool with our sample dataset:</p>
|
39 |
+
<ul>
|
40 |
+
<li><strong>Dataset:</strong> <code>mpox.csv</code> - News coverage time series for Monkeypox/Mpox outbreak</li>
|
41 |
+
<li><strong>Time Period:</strong> Daily counts from early 2022</li>
|
42 |
+
<li><strong>Recommended Settings:</strong>
|
43 |
+
<ul>
|
44 |
+
<li>Method: LSTM</li>
|
45 |
+
<li>k: 1.5</li>
|
46 |
+
<li>Percentile: 95</li>
|
47 |
+
<li>Threshold Method: "IQR on |ground truth - forecast|"</li>
|
48 |
+
</ul>
|
49 |
+
</li>
|
50 |
+
<li><strong>Expected Results:</strong> The analysis should identify significant spikes in news coverage that corresponded to major outbreak events and public health announcements during the 2022 Mpox outbreak.</li>
|
51 |
+
</ul>
|
52 |
+
</div>
|
53 |
+
|
54 |
+
<p>This tool combines time series analysis and anomaly detection to help identify potential disease outbreaks based on news coverage patterns. The results can be used to alert public health officials about emerging health concerns. 💡</p>
|
55 |
</section>
|
src/__init__.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .plotting.visualization import plot_time_series, plot_anomalies
|
2 |
+
from .anomaly_detection import detect_anomalies
|
3 |
+
from .outbreak_detection import (
|
4 |
+
LSTMforOutbreakDetection,
|
5 |
+
ARIMAforOutbreakDetection,
|
6 |
+
IQRforOutbreakDetection
|
7 |
+
)
|
8 |
+
from .utils import (
|
9 |
+
timestamp_wise_evaluation,
|
10 |
+
tolerance_based_evaluation
|
11 |
+
)
|
12 |
+
|
13 |
+
__all__ = [
|
14 |
+
'plot_time_series',
|
15 |
+
'plot_anomalies',
|
16 |
+
'detect_anomalies',
|
17 |
+
'LSTMforOutbreakDetection',
|
18 |
+
'ARIMAforOutbreakDetection',
|
19 |
+
'IQRforOutbreakDetection',
|
20 |
+
'timestamp_wise_evaluation',
|
21 |
+
'tolerance_based_evaluation'
|
22 |
+
'prepare_time_series_dataframe',
|
23 |
+
'tolerance_based_evaluation',
|
24 |
+
]
|
src/anomaly_detection.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from .outbreak_detection import (
|
3 |
+
LSTMforOutbreakDetection,
|
4 |
+
ARIMAforOutbreakDetection,
|
5 |
+
IQRforOutbreakDetection
|
6 |
+
)
|
7 |
+
from .plotting.visualization import plot_anomalies
|
8 |
+
from .utils import prepare_time_series_dataframe
|
9 |
+
|
10 |
+
|
11 |
+
THRESHOLD_METHODS = {
|
12 |
+
"IQR on (ground truth - forecast)": 0,
|
13 |
+
"IQR on |ground truth - forecast|": 1,
|
14 |
+
"IQR on |ground truth - forecast|/forecast": 2,
|
15 |
+
"Percentile threshold on absolute loss": 3,
|
16 |
+
"Percentile threshold on raw loss": 4
|
17 |
+
}
|
18 |
+
|
19 |
+
def detect_anomalies(file_path: str, method: str, k: int, percentile: float, threshold_method: int):
|
20 |
+
"""
|
21 |
+
Detects anomalies in time series data using various detection methods.
|
22 |
+
Args:
|
23 |
+
file_path (str): Path to the CSV file containing time series data
|
24 |
+
method (str): Detection method to use ('LSTM', 'ARIMA', or 'IQR')
|
25 |
+
k (int): Number of neighbors or window size (method-dependent parameter)
|
26 |
+
percentile (float): Percentile threshold for anomaly detection
|
27 |
+
threshold_method (int): Method to determine threshold for anomaly detection
|
28 |
+
Returns:
|
29 |
+
plotly.graph_objects.Figure: Plotly figure containing the time series with highlighted anomalies
|
30 |
+
"""
|
31 |
+
df = pd.read_csv(file_path)
|
32 |
+
df = prepare_time_series_dataframe(df)
|
33 |
+
|
34 |
+
# Map threshold methods to their descriptions for better readability
|
35 |
+
|
36 |
+
detectors = {
|
37 |
+
'LSTM': LSTMforOutbreakDetection(
|
38 |
+
checkpoint_path='models/lstm_forec_40_11_06.pth',
|
39 |
+
k=k,
|
40 |
+
percentile=percentile,
|
41 |
+
threshold_method=THRESHOLD_METHODS[threshold_method]
|
42 |
+
),
|
43 |
+
'ARIMA': ARIMAforOutbreakDetection(k=k),
|
44 |
+
'IQR': IQRforOutbreakDetection(k=k)
|
45 |
+
}
|
46 |
+
|
47 |
+
detector = detectors[method]
|
48 |
+
test, new_label = detector.detect_anomalies(df)
|
49 |
+
return plot_anomalies(test, anomaly_col=new_label)
|
50 |
+
|
51 |
+
|
src/outbreak_detection/__init__.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .lstm import LSTMforOutbreakDetection
|
2 |
+
from .arima import ARIMAforOutbreakDetection
|
3 |
+
from .iqr import IQRforOutbreakDetection
|
4 |
+
from .lstm_model import LstmModel, testing
|
5 |
+
|
6 |
+
__all__ = [
|
7 |
+
'LSTMforOutbreakDetection',
|
8 |
+
'ARIMAforOutbreakDetection',
|
9 |
+
'IQRforOutbreakDetection',
|
10 |
+
'LstmModel',
|
11 |
+
'testing'
|
12 |
+
]
|
src/outbreak_detection/arima.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from statsmodels.tsa.stattools import adfuller, acf, pacf
|
4 |
+
from statsmodels.tsa.arima.model import ARIMA
|
5 |
+
|
6 |
+
|
7 |
+
NEW_ANOMALY_COLUMN_NAME = 'anomaly'
|
8 |
+
|
9 |
+
class ARIMAforOutbreakDetection:
|
10 |
+
def __init__(self, window_size=7, stride=1, k=1.5, significance=0.05, max_lag=30):
|
11 |
+
self.window_size = window_size
|
12 |
+
self.stride = stride
|
13 |
+
self.k = k
|
14 |
+
self.significance = significance
|
15 |
+
self.max_lag = max_lag
|
16 |
+
|
17 |
+
def test_stationarity(self, ts_data, column=''):
|
18 |
+
if isinstance(ts_data, pd.Series):
|
19 |
+
adf_test = adfuller(ts_data, autolag='AIC')
|
20 |
+
else:
|
21 |
+
adf_test = adfuller(ts_data[column], autolag='AIC')
|
22 |
+
return "Stationary" if adf_test[1] <= self.significance else "Non-Stationary"
|
23 |
+
|
24 |
+
def make_stationary(self, dataframe, column):
|
25 |
+
df_to_return = None
|
26 |
+
result = self.test_stationarity(dataframe, column)
|
27 |
+
|
28 |
+
if result == "Stationary":
|
29 |
+
return dataframe
|
30 |
+
|
31 |
+
diff_series = dataframe.copy()
|
32 |
+
for diff_count in range(5):
|
33 |
+
diff_series = diff_series.diff().fillna(0)
|
34 |
+
if self.test_stationarity(diff_series, column) == "Stationary":
|
35 |
+
return diff_series
|
36 |
+
return diff_series
|
37 |
+
|
38 |
+
def create_windows(self, df):
|
39 |
+
windows, gts = [], []
|
40 |
+
for i in range(0, len(df) - self.window_size, self.stride):
|
41 |
+
end_id = i + self.window_size
|
42 |
+
windows.append(df.iloc[i:end_id, :])
|
43 |
+
gts.append(df.iloc[end_id, :])
|
44 |
+
return np.stack(windows), np.stack(gts)
|
45 |
+
|
46 |
+
def find_p_q(self, series):
|
47 |
+
N = len(series)
|
48 |
+
acf_values, _ = acf(series, nlags=self.max_lag, alpha=self.significance, fft=False)
|
49 |
+
pacf_values, _ = pacf(series, nlags=self.max_lag, alpha=self.significance)
|
50 |
+
threshold = 1.96 / np.sqrt(N)
|
51 |
+
|
52 |
+
def find_last_consecutive_outlier(values):
|
53 |
+
for i in range(1, len(values)):
|
54 |
+
if values[i] < 0 or (values[i] > 0 and abs(values[i]) < threshold):
|
55 |
+
return i
|
56 |
+
return len(values) - 1
|
57 |
+
|
58 |
+
return find_last_consecutive_outlier(pacf_values), find_last_consecutive_outlier(acf_values)
|
59 |
+
|
60 |
+
def detect_anomalies(self, dataset, news_or_cases='news'):
|
61 |
+
stationary_data = self.make_stationary(dataset, news_or_cases)
|
62 |
+
p, q = self.find_p_q(stationary_data[news_or_cases])
|
63 |
+
anomalies, means, stdevs, residuals, predictions, gts = self._train_arima_model(stationary_data, p, q)
|
64 |
+
result_df = self._prepare_resulting_dataframe(
|
65 |
+
residuals, means, stdevs, dataset.iloc[self.window_size:],
|
66 |
+
anomalies, gts, predictions
|
67 |
+
)
|
68 |
+
return self._postprocess_anomalies(result_df, news_or_cases), NEW_ANOMALY_COLUMN_NAME
|
69 |
+
|
70 |
+
def _train_arima_model(self, dataset, p, q):
|
71 |
+
predictions, residuals, means, stdevs, anomalies = [], [], [], [], []
|
72 |
+
windows, gts = self.create_windows(dataset)
|
73 |
+
|
74 |
+
for window, gt in zip(windows, gts):
|
75 |
+
model = ARIMA(window, order=(p, 0, q))
|
76 |
+
model.initialize_approximate_diffuse()
|
77 |
+
fit = model.fit()
|
78 |
+
|
79 |
+
pred = fit.forecast(steps=1)[0]
|
80 |
+
residual = np.abs(gt - pred)
|
81 |
+
mu, std = np.mean(fit.resid), np.std(fit.resid)
|
82 |
+
|
83 |
+
anomalies.append(
|
84 |
+
1 if residual > mu + self.k * std or residual < mu - self.k * std else 0
|
85 |
+
)
|
86 |
+
|
87 |
+
means.append(mu)
|
88 |
+
stdevs.append(std)
|
89 |
+
residuals.append(residual)
|
90 |
+
predictions.append(pred)
|
91 |
+
|
92 |
+
return anomalies, means, stdevs, residuals, predictions, gts
|
93 |
+
|
94 |
+
def _prepare_resulting_dataframe(self, residuals, means, stdevs, original_dataset,
|
95 |
+
anomalies, gts, predictions):
|
96 |
+
result_df = original_dataset.copy()
|
97 |
+
result_df['residuals'] = residuals
|
98 |
+
result_df['mu'] = means
|
99 |
+
result_df['sigma'] = stdevs
|
100 |
+
result_df['anomaly'] = anomalies
|
101 |
+
result_df['gts_diff'] = gts
|
102 |
+
result_df['pred_diff'] = predictions
|
103 |
+
return result_df
|
104 |
+
|
105 |
+
def _postprocess_anomalies(self, dataframe, col_name='news'):
|
106 |
+
dataframe['derivative'] = dataframe[col_name].diff().fillna(0)
|
107 |
+
dataframe['new_anomaly'] = [
|
108 |
+
0 if row.derivative < 0 and row.anomaly == 1 else row.anomaly
|
109 |
+
for _, row in dataframe.iterrows()
|
110 |
+
]
|
111 |
+
return dataframe
|
src/outbreak_detection/iqr.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
|
5 |
+
NEW_LABEL_COLUMN_NAME = 'new_label'
|
6 |
+
|
7 |
+
class IQRforOutbreakDetection:
|
8 |
+
def __init__(self, window_size=7, stride=1, k=1.5):
|
9 |
+
self.window_size = window_size
|
10 |
+
self.stride = stride
|
11 |
+
self.k = k
|
12 |
+
|
13 |
+
def _iqr_rolling(self, timeseries):
|
14 |
+
q1 = np.percentile(timeseries, 25)
|
15 |
+
q3 = np.percentile(timeseries, 75)
|
16 |
+
iqr = q3 - q1
|
17 |
+
ub = q3 + self.k * iqr
|
18 |
+
lb = q1 - self.k * iqr
|
19 |
+
return ub, lb
|
20 |
+
|
21 |
+
def detect_anomalies(self, df, news_or_cases='news'):
|
22 |
+
""""
|
23 |
+
input methods: k
|
24 |
+
"""
|
25 |
+
if isinstance(df, pd.Series):
|
26 |
+
timeseries = df
|
27 |
+
else:
|
28 |
+
timeseries = df[news_or_cases]
|
29 |
+
|
30 |
+
tot_peaks, final_peaks, _ = self._windowed_iqr(timeseries)
|
31 |
+
result_df = self._prepare_resulting_dataframe(final_peaks, timeseries)
|
32 |
+
processed_df = self._postprocess_anomalies(result_df, news_or_cases)
|
33 |
+
print(processed_df)
|
34 |
+
|
35 |
+
return processed_df, NEW_LABEL_COLUMN_NAME
|
36 |
+
|
37 |
+
def _windowed_iqr(self, df):
|
38 |
+
tot_peaks = {}
|
39 |
+
for i in range(0, len(df) - self.window_size + 1, self.stride):
|
40 |
+
end_id = i + self.window_size
|
41 |
+
window = df[i:end_id]
|
42 |
+
ub, _ = self._iqr_rolling(window)
|
43 |
+
|
44 |
+
for j in window.index:
|
45 |
+
peaks_list = tot_peaks.setdefault(f'{j}', [])
|
46 |
+
peaks_list.append(window.loc[j] > ub)
|
47 |
+
|
48 |
+
final_peaks = {k: True if True in v else False
|
49 |
+
for k, v in tot_peaks.items()}
|
50 |
+
|
51 |
+
return tot_peaks, final_peaks, end_id
|
52 |
+
|
53 |
+
def _prepare_resulting_dataframe(self, peaks_df, news_or_cases_df):
|
54 |
+
final_df_iqr = pd.DataFrame.from_dict(peaks_df, orient='index')
|
55 |
+
dff = pd.DataFrame(news_or_cases_df)
|
56 |
+
dff['peaks'] = final_df_iqr.loc[:, 0].values
|
57 |
+
dff['peaks'] = dff['peaks'].map({True: 1, False: 0})
|
58 |
+
return dff
|
59 |
+
|
60 |
+
def _postprocess_anomalies(self, dataframe, col_name='news'):
|
61 |
+
dataframe['derivative'] = dataframe[col_name].diff().fillna(0)
|
62 |
+
dataframe['new_label'] = [0 if v.derivative < 0 and v.peaks == 1 else v.peaks
|
63 |
+
for _, v in dataframe.iterrows()]
|
64 |
+
return dataframe
|
src/outbreak_detection/lstm.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import torch.utils.data as data_utils
|
5 |
+
from sklearn.preprocessing import MinMaxScaler
|
6 |
+
from .lstm_model import LstmModel, testing
|
7 |
+
|
8 |
+
|
9 |
+
PRETRAINED_MODEL_N_CHANNELS = 1
|
10 |
+
PRETRAINED_MODEL_Z_SIZE = 32
|
11 |
+
|
12 |
+
class LSTMforOutbreakDetection:
|
13 |
+
def __init__(
|
14 |
+
self,
|
15 |
+
checkpoint_path=None,
|
16 |
+
n_channels=PRETRAINED_MODEL_N_CHANNELS,
|
17 |
+
z_size=PRETRAINED_MODEL_Z_SIZE,
|
18 |
+
device='cpu',
|
19 |
+
window=7,
|
20 |
+
batch_size=32,
|
21 |
+
k=1.5,
|
22 |
+
percentile=95,
|
23 |
+
threshold_method=0
|
24 |
+
):
|
25 |
+
self.device = torch.device(device)
|
26 |
+
self.window = window
|
27 |
+
self.batch_size = batch_size
|
28 |
+
self.n_channels = n_channels
|
29 |
+
self.z_size = z_size
|
30 |
+
self.scaler = MinMaxScaler(feature_range=(0,1))
|
31 |
+
self.k = k
|
32 |
+
self.percentile = percentile
|
33 |
+
self.threshold_method = threshold_method
|
34 |
+
if checkpoint_path:
|
35 |
+
self.model = self._load_model(checkpoint_path)
|
36 |
+
|
37 |
+
def _load_model(self, checkpoint_path):
|
38 |
+
model = LstmModel(self.n_channels, self.z_size)
|
39 |
+
model = model.to(self.device)
|
40 |
+
model.load_state_dict(torch.load(checkpoint_path, map_location=self.device))
|
41 |
+
return model
|
42 |
+
|
43 |
+
def create_test_sequences(self, dataframe, time_steps, news_or_cases='news'):
|
44 |
+
if news_or_cases not in ['news', 'cases']:
|
45 |
+
raise ValueError("news_or_cases should be either 'news' or 'cases'")
|
46 |
+
|
47 |
+
output, output2 = [], []
|
48 |
+
dataframe[[news_or_cases]] = self.scaler.fit_transform(dataframe[[news_or_cases]])
|
49 |
+
norm = np.array(dataframe[[news_or_cases]]).astype(float)
|
50 |
+
|
51 |
+
for i in range(len(norm)):
|
52 |
+
end_ix = i + time_steps
|
53 |
+
|
54 |
+
if end_ix > len(norm)-1:
|
55 |
+
break
|
56 |
+
|
57 |
+
seq_x, seq_y = norm[i:end_ix, :], norm[end_ix, 0]
|
58 |
+
output.append(seq_x)
|
59 |
+
output2.append(seq_y)
|
60 |
+
|
61 |
+
return np.stack(output), np.stack(output2)
|
62 |
+
|
63 |
+
def prepare_input_dataframe(self, dataframe, news_column_name='news'):
|
64 |
+
X_test, y_test = self.create_test_sequences(dataframe, self.window, news_column_name)
|
65 |
+
test_loader = torch.utils.data.DataLoader(
|
66 |
+
data_utils.TensorDataset(
|
67 |
+
torch.from_numpy(X_test).float(),
|
68 |
+
torch.from_numpy(y_test).float()
|
69 |
+
),
|
70 |
+
batch_size=self.batch_size,
|
71 |
+
shuffle=False,
|
72 |
+
num_workers=0
|
73 |
+
)
|
74 |
+
return test_loader, y_test
|
75 |
+
|
76 |
+
def predict(self, dataframe, news_column_name='news'):
|
77 |
+
test_loader, y_test = self.prepare_input_dataframe(dataframe, news_column_name)
|
78 |
+
results, w = testing(self.model, test_loader, self.device)
|
79 |
+
forecast_test = np.concatenate([
|
80 |
+
torch.stack(w[:-1]).flatten().detach().cpu().numpy(),
|
81 |
+
w[-1].flatten().detach().cpu().numpy()
|
82 |
+
])
|
83 |
+
|
84 |
+
test_df = dataframe[self.window:].copy()
|
85 |
+
test_df['y_test'] = y_test
|
86 |
+
test_df['pred_forec'] = forecast_test
|
87 |
+
test_df['abs_loss'] = np.abs(test_df.y_test - test_df.pred_forec)
|
88 |
+
test_df['rel_loss'] = np.abs((test_df['pred_forec'] - test_df['y_test']) / (1 + test_df['pred_forec']))
|
89 |
+
test_df['diff'] = test_df['y_test'] - test_df['pred_forec']
|
90 |
+
|
91 |
+
return test_df
|
92 |
+
|
93 |
+
@staticmethod
def _iqr_rolling(timeseries, k):
    """Upper Tukey fence for a window of values: Q3 + k * IQR."""
    lower_q, upper_q = np.percentile(timeseries, [25, 75])
    return upper_q + k * (upper_q - lower_q)
99 |
+
|
100 |
+
def windowed_iqr(self, df, k, type_of_loss='diff'):
    """Flag rows whose loss exceeds a rolling IQR upper fence.

    A window of ``self.window`` rows slides over ``df``; every row is
    voted on by each window that contains it and is labelled 1 if at
    least one window puts its ``type_of_loss`` value above the fence.

    Returns
    -------
    dict
        Mapping of stringified index values to 0/1 anomaly flags.
    """
    votes = {}
    for start in range(len(df)):
        stop = start + self.window
        if stop > len(df) - 1:
            break
        window = df.iloc[start:stop, :]
        fence = self._iqr_rolling(window[type_of_loss], k)
        for idx in window.index:
            exceeded = int(window.loc[idx, type_of_loss] > fence)
            votes.setdefault(f'{idx}', []).append(exceeded)
    return {key: int(any(flags)) for key, flags in votes.items()}
117 |
+
|
118 |
+
def get_perc_threshold(self, test_df, percentile, col='abs_loss'):
    """Percentile-threshold anomaly labelling over sliding windows.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Output of ``predict`` (must contain y_test/pred_forec/abs_loss/diff).
    percentile : float
        Percentile of the windowed loss used as the threshold.
    col : str
        'abs_loss' thresholds |y - yhat|; 'loss' thresholds the signed
        residual (y - yhat).

    Returns
    -------
    pandas.DataFrame
        ``test_df[:-1]`` with a new 0/1 anomaly column.
    """
    if col not in ['abs_loss', 'loss']:
        raise ValueError("col should be either 'abs_loss' or 'loss'")

    # Fix: 'loss' is not an actual column of the prediction frame — the
    # signed residual lives in 'diff'. Reading seq_x.loc[j, 'loss'] raised
    # a KeyError, breaking threshold method 4.
    value_col = 'abs_loss' if col == 'abs_loss' else 'diff'

    test1 = test_df[:-1].copy()
    anom_perc_loss = {}

    for i in range(len(test_df)):
        end_ix = i + self.window
        if end_ix > len(test_df) - 1:
            break

        seq_x = test_df.iloc[i:end_ix, :]
        mae = seq_x['abs_loss'].values if col == 'abs_loss' else seq_x['y_test'] - seq_x['pred_forec']
        threshold = np.percentile(mae, percentile)

        for j in seq_x.index:
            condition = int(seq_x.loc[j, value_col] > threshold)
            anom_perc_loss.setdefault(f'{j}', []).append(condition)

    final_anom = {k: 1 if sum(v) > 0 else 0 for k, v in anom_perc_loss.items()}
    new_col = 'anom_perc_abs_loss' if col == 'abs_loss' else 'anom_perc_diff_gt_pred'

    # Align the flags with the frame's datetime index; the dict keys are
    # stringified index values (mirrors the conversion done for the IQR
    # path in detect_anomalies, which otherwise left this column all-NaN).
    anom_series = pd.Series(final_anom)
    try:
        anom_series.index = pd.to_datetime(anom_series.index)
    except (ValueError, TypeError):
        pass  # non-datetime index: fall back to the original string keys
    test1[new_col] = anom_series

    return test1
145 |
+
|
146 |
+
def postprocess_anomalies(self, test_df, new_col, old_col, news_or_cases):
    """Suppress anomaly flags that fall on a downward slope of the signal.

    Adds a ``derivative`` column (first difference of ``news_or_cases``)
    and writes ``new_col``: a copy of ``old_col`` with flags zeroed out
    wherever the signal is decreasing. Returns a new frame; the input is
    left untouched.
    """
    result = test_df.copy()
    result['derivative'] = result[news_or_cases].diff().fillna(0)
    cleaned = []
    for slope, flag in zip(result['derivative'], result[old_col]):
        cleaned.append(0 if (slope < 0 and flag == 1) else flag)
    result[new_col] = cleaned
    return result
153 |
+
|
154 |
+
def detect_anomalies(self, test_df, news_or_cases='news'):
    """Forecast the series and label anomalies with the configured strategy.

    ``self.threshold_method`` selects the detector:
      0: IQR on (ground truth - forecast)
      1: IQR on |ground truth - forecast|
      2: IQR on |ground truth - forecast| / forecast
      3: percentile threshold on the absolute loss
      4: percentile threshold on the raw (signed) loss

    Uses ``self.k`` (IQR multiplier) and ``self.percentile`` as tuning
    parameters. Returns (annotated frame, name of the anomaly column).
    """
    iqr_config = {
        0: ('diff', 'f_iqr', 'f_new_label'),
        1: ('abs_loss', 'abs_iqr', 'abs_new_label'),
        2: ('rel_loss', 'rel_iqr', 'rel_new_label'),
    }
    perc_config = {
        3: ('abs_loss', 'anom_perc_abs_loss', 'new_anom_absl'),
        4: ('loss', 'anom_perc_diff_gt_pred', 'new_anom_diff'),
    }

    frame = self.predict(test_df.copy(), news_column_name=news_or_cases)
    method = self.threshold_method

    if method in iqr_config:
        loss_type, iqr_col, label_col = iqr_config[method]
        flags = pd.Series(self.windowed_iqr(frame, self.k, loss_type))
        flags.index = pd.to_datetime(flags.index)
        frame[iqr_col] = flags
        frame = self.postprocess_anomalies(frame, label_col, iqr_col, news_or_cases)
        return frame, label_col

    if method in perc_config:
        loss_type, raw_col, label_col = perc_config[method]
        frame = self.get_perc_threshold(frame, self.percentile, loss_type)
        frame = self.postprocess_anomalies(frame, label_col, raw_col, news_or_cases)
        return frame, label_col

    raise ValueError("threshold_method must be between 0 and 4")
191 |
+
|
src/outbreak_detection/lstm_model.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
|
5 |
+
class LstmModel(nn.Module):
    """LSTM forecaster: encodes a window of observations, predicts the next step.

    Maps an input of shape (batch, seq_len, in_size) to a forecast of shape
    (batch, in_size) by running a single LSTM layer, taking the output at the
    final time step, and projecting it back to the input dimension.
    """

    def __init__(self, in_size: int, latent_size: int) -> None:
        """
        Parameters
        ----------
        in_size : int
            Number of input features (also the output dimension).
        latent_size : int
            Hidden-state size of the LSTM encoder.
        """
        super().__init__()
        # NOTE: with num_layers=1 the LSTM's own dropout argument has no
        # effect (PyTorch only applies it between stacked layers); the
        # explicit nn.Dropout below is the one that actually fires.
        self.lstm = nn.LSTM(
            input_size=in_size,
            hidden_size=latent_size,
            num_layers=1,
            batch_first=True,  # tensors are (batch, seq_len, features)
            dropout=0.2,
        )
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(latent_size, in_size)

    def forward(self, w: torch.Tensor) -> torch.Tensor:
        """Return the one-step-ahead forecast for each sequence in the batch.

        Parameters
        ----------
        w : torch.Tensor
            Input of shape (batch, seq_len, in_size).

        Returns
        -------
        torch.Tensor
            Forecast of shape (batch, in_size).
        """
        encoded, _ = self.lstm(w)
        last_step = encoded[:, -1, :]           # LSTM output at the final step
        activated = self.relu(last_step)
        regularized = self.dropout(activated)   # identity when in eval() mode
        return self.fc(regularized)
71 |
+
|
72 |
+
def testing(model, test_loader, device):
|
73 |
+
results=[]
|
74 |
+
forecast = []
|
75 |
+
with torch.no_grad():
|
76 |
+
for X_batch, y_batch in test_loader:
|
77 |
+
X_batch = X_batch.to(device)
|
78 |
+
y_batch = y_batch.to(device)
|
79 |
+
w=model(X_batch)
|
80 |
+
results.append(torch.mean((y_batch.unsqueeze(1)-w)**2, axis=1))
|
81 |
+
forecast.append(w)
|
82 |
+
|
83 |
+
return results, forecast
|
src/plotting/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .visualization import plot_time_series, plot_anomalies
|
2 |
+
|
3 |
+
__all__ = ['plot_time_series', 'plot_anomalies']
|
src/plotting/visualization.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import plotly.graph_objects as go
|
3 |
+
|
4 |
+
|
5 |
+
NEWS_COUNT_COLUMN = 0
|
6 |
+
|
7 |
+
def plot_time_series(file_path):
    """Read a CSV (date, count) file and return a Plotly line chart of it."""
    frame = pd.read_csv(file_path)
    dates = frame.iloc[:, 0]
    counts = frame.iloc[:, 1]

    figure = go.Figure(
        data=[go.Scatter(x=dates, y=counts, mode='lines', name='Time Series')]
    )
    figure.update_layout(
        title='Disease Mention Time Series',
        xaxis_title='Date',
        yaxis_title='Count',
    )
    return figure
14 |
+
|
15 |
+
def plot_anomalies(df, anomaly_col='new_label'):
    """Plot the time series with detected anomalies overlaid as red markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the counts (see NEWS_COUNT_COLUMN)
        and which contains a 0/1 ``anomaly_col``; the index supplies the x axis.
    anomaly_col : str
        Name of the binary anomaly-flag column.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    # Fix: removed leftover debug `print(df)` that dumped the whole frame
    # to stdout on every plot request.
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=df.index,
        y=df.iloc[:, NEWS_COUNT_COLUMN],
        mode='lines',
        name='Time Series',
        line=dict(color='blue')
    ))

    anomalies = df[df[anomaly_col] == 1]

    fig.add_trace(go.Scatter(
        x=anomalies.index,
        y=anomalies.iloc[:, NEWS_COUNT_COLUMN],
        mode='markers',
        name='Anomalies',
        marker=dict(color='red', size=10, symbol='circle')
    ))

    fig.update_layout(
        title='Disease Mention Time Series with Detected Anomalies',
        xaxis_title='Date',
        yaxis_title='Count',
        showlegend=True
    )

    return fig
src/utils.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.metrics import classification_report
|
4 |
+
|
5 |
+
def timestamp_wise_evaluation(anomalies_cases, anomalies_news, threshold):
    """Print a per-timestamp classification report (cases = truth, news = prediction)."""
    header = f"Classification report for threshold {threshold} (timestamp-wise evaluation):"
    print(header)
    print(classification_report(anomalies_cases, anomalies_news))
8 |
+
|
9 |
+
def tolerance_based_evaluation(anomalies_cases, anomalies_news, cases_df, news_df, threshold):
    """Evaluate news-based anomaly flags against case-based flags with a forward tolerance.

    A news anomaly at position ``i`` counts as a true positive if a case
    anomaly occurs at ``i``, ``i+1`` or ``i+2`` (window clipped at the end of
    the series). Timestamps without a news flag are scored against the case
    flag at ``i`` only. Prints TP/FP/FN/TN counts plus precision/recall/F1.

    Parameters
    ----------
    anomalies_cases, anomalies_news : str
        Column names of the 0/1 flags in ``cases_df`` / ``news_df``.
    cases_df, news_df : pandas.DataFrame
        Equal-length frames aligned on position.
    threshold : Any
        Identifier echoed in the printed header.
    """
    Tp = Fp = Fn = Tn = 0
    last = len(news_df) - 1

    for i in range(len(news_df)):
        if news_df.iloc[i][anomalies_news] == 1:
            # look ahead up to two steps, clipped at the series end —
            # collapses the previous triplicated branch ladder
            window_end = min(i + 2, last)
            hit = any(
                cases_df.iloc[j][anomalies_cases] == 1
                for j in range(i, window_end + 1)
            )
            if hit:
                Tp += 1
            else:
                Fp += 1
        else:
            if cases_df.iloc[i][anomalies_cases] == 1:
                Fn += 1
            else:
                Tn += 1

    print(f"Tolerance-based evaluation for method {threshold}:")
    print(f"True Positives: {Tp}, False Positives: {Fp}, False Negatives: {Fn}, True Negatives: {Tn}")
    # Fix: guard against empty classes so the report never crashes with a
    # ZeroDivisionError when no anomalies are flagged or present.
    precision = Tp / (Tp + Fp) if (Tp + Fp) else 0.0
    recall = Tp / (Tp + Fn) if (Tp + Fn) else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    print(f"Precision: {precision}, Recall: {recall}, F1: {f1}")
54 |
+
|
55 |
+
|
56 |
+
def prepare_time_series_dataframe(df):
    """Prepare a frame for time-series analysis.

    Promotes the first column to a datetime index (in place) and renames
    the remaining first column to ``news``.

    Raises
    ------
    ValueError
        If the first column cannot be parsed as datetimes.
    """
    df.set_index(df.columns[0], inplace=True)

    try:
        parsed_index = pd.to_datetime(df.index)
    except ValueError:
        raise ValueError("The first column of the CSV file must be a datetime column.")
    df.index = parsed_index

    # after set_index, columns[0] is the value column — rename it to 'news'
    df.rename(columns={df.columns[0]: "news"}, inplace=True)
    return df
67 |
+
|
68 |
+
def update_controls(method):
    """Enable the LSTM-specific controls only when the LSTM method is selected.

    Args:
        method (str): The selected anomaly detection method

    Returns:
        list: Two Gradio update objects toggling control interactivity.
    """
    enable = (method == "LSTM")
    return [gr.update(interactive=enable) for _ in range(2)]
src/visualization.py
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
import plotly.express as px
|
3 |
-
|
4 |
-
|
5 |
-
def plot_time_series(file):
|
6 |
-
"""
|
7 |
-
Plots a time series graph from a CSV file.
|
8 |
-
|
9 |
-
This function reads the CSV file and generates a line plot
|
10 |
-
showing the disease mentions over time.
|
11 |
-
|
12 |
-
"""
|
13 |
-
df = pd.read_csv(file.name)
|
14 |
-
fig = px.line(
|
15 |
-
df,
|
16 |
-
x=df.columns[0],
|
17 |
-
y=df.columns[1],
|
18 |
-
title='Disease Mentions Over Time'
|
19 |
-
)
|
20 |
-
return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|