
Commit 788a458

ngoix authored and amueller committed
[MRG+2] LOF algorithm (Anomaly Detection) (scikit-learn#5279)
* LOF algorithm: add tests and an example; fix DeprecationWarning by reshape(1, -1) on one-sample data; implement LOF with inheritance; check that lof and lof2 return the same score; fix bugs; optimize and cosmit; rm lof2; rm MixinLOF + fit_predict; fix travis; optimize pairwise_distance as in KNeighborsMixin.kneighbors; add a comparison example + doc; rename LOF -> LocalOutlierFactor; change the LOF API: fit(X).predict() and fit(X).decision_function() do prediction on X without considering samples as their own neighbors (i.e. without considering X as a new dataset, as fit(X).predict(X) does), rm the fit_predict() method, and add a contamination parameter so that predict returns a binary value like other anomaly detection algos; doc + debug example correction; pass on doc + examples; pep8 + fix warnings; first attempt at fixing API issues; minor changes; take tguillemot's advice into account: remove the pairwise_distance calculation as too heavy in memory, and add benchmarks; minor changes + deal with duplicates; fix deprecation warnings
* factorize the two for loops
* take into account @albertthomas88's review and cosmit
* fix doc
* alex review + rebase
* make predict private, add the outlier_factor_ attribute and update tests
* make fit_predict take a y argument
* fix the benchmarks file
* update examples
* make decision_function public (rm the X=None default)
* fix travis
* take into account tguillemot's review + remove the useless k_distance function
* fix broken :meth:`kneighbors` links
* cosmit
* whatsnew
* amueller review + remove the _local_outlier_factor method
* add the n_neighbors_ attribute: the effective number of neighbors used
* make decision_function private and rename the attribute to negative_outlier_factor_
1 parent 73d3f03 commit 788a458
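
As a quick orientation, here is a minimal sketch of the API the commit message above settles on: fit_predict labels the training samples themselves, the raw scores are exposed as negative_outlier_factor_, and contamination sets the binary threshold. The toy data and parameter values are illustrative, not taken from the commit.

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(42)
# 100 tight inliers plus 20 scattered points, for illustration only
X = np.r_[0.3 * rng.randn(100, 2), rng.uniform(low=-4, high=4, size=(20, 2))]

clf = LocalOutlierFactor(n_neighbors=20, contamination=0.15)
y_pred = clf.fit_predict(X)            # +1 for inliers, -1 for outliers
scores = clf.negative_outlier_factor_  # the lower, the more abnormal
print(clf.n_neighbors_, (y_pred == -1).sum())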

File tree

12 files changed, +710 -33 lines


benchmarks/bench_lof.py

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
"""
============================
LocalOutlierFactor benchmark
============================

A test of LocalOutlierFactor on classical anomaly detection datasets.

"""

from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle as sh

print(__doc__)

np.random.seed(2)

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['shuttle']

novelty_detection = True  # if False, training set polluted by outliers

for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
                                 percent10=False)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dataset_name == 'forestcover':
        dataset = fetch_covtype(shuffle=True)
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)

    print('vectorizing data')

    if dataset_name == 'SF':
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != 'normal.').astype(int)

    if dataset_name == 'SA':
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        lb.fit(X[:, 2])
        x2 = lb.transform(X[:, 2])
        lb.fit(X[:, 3])
        x3 = lb.transform(X[:, 3])
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != 'normal.').astype(int)

    if dataset_name == 'http' or dataset_name == 'smtp':
        y = (y != 'normal.').astype(int)

    n_samples, n_features = np.shape(X)
    n_samples_train = n_samples // 2
    n_samples_test = n_samples - n_samples_train

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    if novelty_detection:
        X_train = X_train[y_train == 0]
        y_train = y_train[y_train == 0]

    print('LocalOutlierFactor processing...')
    model = LocalOutlierFactor(n_neighbors=20)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    # decision_function is private for LOF in this commit
    scoring = -model._decision_function(X_test)  # the lower, the more normal
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
    AUC = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1,
             label=('ROC for %s (area = %0.3f, train-time: %0.2fs,'
                    ' test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
                                             predict_time)))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
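
For context, the evaluation pattern the benchmark uses boils down to ranking test points by an abnormality score and measuring the ranking with ROC AUC; a self-contained toy version (labels and scores invented for illustration):

import numpy as np
from sklearn.metrics import roc_curve, auc

y_true = np.array([0, 0, 0, 1, 0, 1])              # 1 marks an anomaly
abnormality = np.array([.1, .3, .2, .9, .25, .8])  # higher == more abnormal
fpr, tpr, _ = roc_curve(y_true, abnormality)
print('AUC: %0.3f' % auc(fpr, tpr))                # prints 1.0: perfect ranking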

doc/modules/classes.rst

Lines changed: 2 additions & 1 deletion
@@ -1050,7 +1050,8 @@ See the :ref:`metrics` section of the user guide for further details.
    neighbors.LSHForest
    neighbors.DistanceMetric
    neighbors.KernelDensity
-
+   neighbors.LocalOutlierFactor
+
 .. autosummary::
    :toctree: generated/
    :template: function.rst

doc/modules/outlier_detection.rst

Lines changed: 75 additions & 14 deletions
@@ -165,18 +165,76 @@ This strategy is illustrated below.

 * See :ref:`sphx_glr_auto_examples_covariance_plot_outlier_detection.py` for a
   comparison of :class:`ensemble.IsolationForest` with
+  :class:`neighbors.LocalOutlierFactor`,
   :class:`svm.OneClassSVM` (tuned to perform like an outlier detection
   method) and a covariance-based outlier detection with
-  :class:`covariance.MinCovDet`.
+  :class:`covariance.EllipticEnvelope`.

 .. topic:: References:

    .. [LTZ2008] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
       Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.

-One-class SVM versus Elliptic Envelope versus Isolation Forest
---------------------------------------------------------------
+Local Outlier Factor
+--------------------
+Another efficient way to perform outlier detection on moderately high-dimensional
+datasets is to use the Local Outlier Factor (LOF) algorithm.
+
+The :class:`neighbors.LocalOutlierFactor` (LOF) algorithm computes a score
+(called the local outlier factor) reflecting the degree of abnormality of the
+observations.
+It measures the local density deviation of a given data point with respect to
+its neighbors. The idea is to detect the samples that have a substantially
+lower density than their neighbors.
+
+In practice the local density is obtained from the k-nearest neighbors.
+The LOF score of an observation is equal to the ratio of the
+average local density of its k-nearest neighbors to its own local density:
+a normal instance is expected to have a local density similar to that of its
+neighbors, while abnormal data are expected to have a much smaller local density.
+
+The number k of neighbors considered (alias parameter n_neighbors) is typically
+chosen 1) greater than the minimum number of objects a cluster has to contain,
+so that other objects can be local outliers relative to this cluster, and 2)
+smaller than the maximum number of close-by objects that can potentially be
+local outliers.
+In practice, such information is generally not available, and taking
+n_neighbors=20 appears to work well in general.
+When the proportion of outliers is high (i.e. greater than 10%, as in the
+example below), n_neighbors should be greater (n_neighbors=35 in the example
+below).
+
+The strength of the LOF algorithm is that it takes both local and global
+properties of datasets into consideration: it can perform well even in datasets
+where abnormal samples have different underlying densities.
+The question is not how isolated the sample is, but how isolated it is
+with respect to the surrounding neighborhood.
+
+This strategy is illustrated below.
+
+.. figure:: ../auto_examples/neighbors/images/sphx_glr_plot_lof_001.png
+   :target: ../auto_examples/neighbors/plot_lof.html
+   :align: center
+   :scale: 75%
+
+.. topic:: Examples:
+
+   * See :ref:`sphx_glr_auto_examples_neighbors_plot_lof.py` for
+     an illustration of the use of :class:`neighbors.LocalOutlierFactor`.
+
+   * See :ref:`sphx_glr_auto_examples_covariance_plot_outlier_detection.py` for a
+     comparison with other anomaly detection methods.
+
+.. topic:: References:
+
+   .. [BKNS2000] Breunig, Kriegel, Ng, and Sander (2000)
+      `LOF: identifying density-based local outliers.
+      <http://www.dbs.ifi.lmu.de/Publikationen/Papers/LOF.pdf>`_
+      Proc. ACM SIGMOD
+
+One-class SVM versus Elliptic Envelope versus Isolation Forest versus LOF
+-------------------------------------------------------------------------

 Strictly speaking, the One-class SVM is not an outlier-detection method,
 but a novelty-detection method: its training set should not be

@@ -188,7 +246,8 @@ results in these situations.
 The examples below illustrate how the performance of the
 :class:`covariance.EllipticEnvelope` degrades as the data is less and
 less unimodal. The :class:`svm.OneClassSVM` works better on data with
-multiple modes and :class:`ensemble.IsolationForest` performs well in every cases.
+multiple modes, and :class:`ensemble.IsolationForest` and
+:class:`neighbors.LocalOutlierFactor` perform well in every case.

 .. |outlier1| image:: ../auto_examples/covariance/images/sphx_glr_plot_outlier_detection_001.png
    :target: ../auto_examples/covariance/plot_outlier_detection.html

@@ -202,7 +261,7 @@ multiple modes and :class:`ensemble.IsolationForest` performs well in every case
    :target: ../auto_examples/covariance/plot_outlier_detection.html
    :scale: 50%

-.. list-table:: **Comparing One-class SVM approach, and elliptic envelope**
+.. list-table:: **Comparing One-class SVM, Isolation Forest, LOF, and Elliptic Envelope**
    :widths: 40 60

 *

@@ -213,31 +272,33 @@ multiple modes and :class:`ensemble.IsolationForest` performs well in every case
      opposite, the decision rule based on fitting an
      :class:`covariance.EllipticEnvelope` learns an ellipse, which
      fits well the inlier distribution. The :class:`ensemble.IsolationForest`
-     performs as well.
-   - |outlier1|
+     and :class:`neighbors.LocalOutlierFactor` perform as well.
+   - |outlier1|

 *
   - As the inlier distribution becomes bimodal, the
     :class:`covariance.EllipticEnvelope` does not fit well the
-    inliers. However, we can see that both :class:`ensemble.IsolationForest`
-    and :class:`svm.OneClassSVM` have difficulties to detect the two modes,
+    inliers. However, we can see that :class:`ensemble.IsolationForest`,
+    :class:`svm.OneClassSVM` and :class:`neighbors.LocalOutlierFactor`
+    have difficulties detecting the two modes,
     and that the :class:`svm.OneClassSVM`
-    tends to overfit: because it has not model of inliers, it
+    tends to overfit: because it has no model of inliers, it
     interprets a region where, by chance, some outliers are
     clustered, as inliers.
   - |outlier2|

 *
   - If the inlier distribution is strongly non-Gaussian, the
     :class:`svm.OneClassSVM` is able to recover a reasonable
-    approximation as well as :class:`ensemble.IsolationForest`,
+    approximation, as well as :class:`ensemble.IsolationForest`
+    and :class:`neighbors.LocalOutlierFactor`,
     whereas the :class:`covariance.EllipticEnvelope` completely fails.
   - |outlier3|

 .. topic:: Examples:

    * See :ref:`sphx_glr_auto_examples_covariance_plot_outlier_detection.py` for a
      comparison of the :class:`svm.OneClassSVM` (tuned to perform like
-     an outlier detection method), the :class:`ensemble.IsolationForest`
-     and a covariance-based outlier
-     detection with :class:`covariance.MinCovDet`.
+     an outlier detection method), the :class:`ensemble.IsolationForest`,
+     the :class:`neighbors.LocalOutlierFactor`
+     and a covariance-based outlier detection with :class:`covariance.EllipticEnvelope`.
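
For readers of the new documentation section, the score it describes can be written out explicitly, following the cited [BKNS2000] paper (notation ours: N_k(A) is the set of k-nearest neighbors of A, d the distance):

\text{reach-dist}_k(A, B) = \max\{k\text{-distance}(B),\; d(A, B)\}

\mathrm{lrd}_k(A) = \left(\frac{1}{|N_k(A)|} \sum_{B \in N_k(A)} \text{reach-dist}_k(A, B)\right)^{-1}

\mathrm{LOF}_k(A) = \frac{1}{|N_k(A)|} \sum_{B \in N_k(A)} \frac{\mathrm{lrd}_k(B)}{\mathrm{lrd}_k(A)}

A factor close to 1 means the point is about as dense as its neighbors; values noticeably above 1 flag it as a local outlier. This also matches the negative_outlier_factor_ naming the commit introduces: the estimator stores the negated factor, so that lower values mean more abnormal, as for the other anomaly detectors.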

doc/whats_new.rst

Lines changed: 4 additions & 1 deletion
@@ -16,6 +16,9 @@ Changelog
 New features
 ............

+   - Added the :class:`neighbors.LocalOutlierFactor` class for anomaly detection
+     based on nearest neighbors. By `Nicolas Goix`_ and `Alexandre Gramfort`_.
+
 Enhancements
 ............

@@ -4740,7 +4743,7 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.

 .. _Eric Martin: http://www.ericmart.in

-.. _Nicolas Goix: https://webperso.telecom-paristech.fr/front/frontoffice.php?SP_ID=241
+.. _Nicolas Goix: https://perso.telecom-paristech.fr/~goix/

 .. _Cory Lorenz: https://github.com/clorenz7

examples/covariance/plot_outlier_detection.py

Lines changed: 29 additions & 12 deletions
@@ -18,6 +18,9 @@
    hence more adapted to large-dimensional settings, even if it performs
    quite well in the examples below.

+- using the Local Outlier Factor to measure the local deviation of a given
+  data point with respect to its neighbors by comparing their local density.
+
 The ground truth about inliers and outliers is given by the points colors
 while the orange-filled area indicates which points are reported as inliers
 by each method.

@@ -27,7 +30,6 @@
 threshold on the decision_function to separate out the corresponding
 fraction.
 """
-print(__doc__)

 import numpy as np
 from scipy import stats

@@ -37,6 +39,9 @@
 from sklearn import svm
 from sklearn.covariance import EllipticEnvelope
 from sklearn.ensemble import IsolationForest
+from sklearn.neighbors import LocalOutlierFactor
+
+print(__doc__)

 rng = np.random.RandomState(42)

@@ -52,10 +57,13 @@
     "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
     "Isolation Forest": IsolationForest(max_samples=n_samples,
                                         contamination=outliers_fraction,
-                                        random_state=rng)}
+                                        random_state=rng),
+    "Local Outlier Factor": LocalOutlierFactor(
+        n_neighbors=35,
+        contamination=outliers_fraction)}

 # Compare given classifiers under given settings
-xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
+xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
 n_inliers = int((1. - outliers_fraction) * n_samples)
 n_outliers = int(outliers_fraction * n_samples)
 ground_truth = np.ones(n_samples, dtype=int)

@@ -72,19 +80,27 @@
     X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

     # Fit the model
-    plt.figure(figsize=(10.8, 3.6))
+    plt.figure(figsize=(9, 7))
     for i, (clf_name, clf) in enumerate(classifiers.items()):
         # fit the data and tag outliers
-        clf.fit(X)
-        scores_pred = clf.decision_function(X)
+        if clf_name == "Local Outlier Factor":
+            y_pred = clf.fit_predict(X)
+            scores_pred = clf.negative_outlier_factor_
+        else:
+            clf.fit(X)
+            scores_pred = clf.decision_function(X)
+            y_pred = clf.predict(X)
         threshold = stats.scoreatpercentile(scores_pred,
                                             100 * outliers_fraction)
-        y_pred = clf.predict(X)
         n_errors = (y_pred != ground_truth).sum()
         # plot the levels lines and the points
-        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+        if clf_name == "Local Outlier Factor":
+            # decision_function is private for LOF
+            Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
+        else:
+            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
         Z = Z.reshape(xx.shape)
-        subplot = plt.subplot(1, 3, i + 1)
+        subplot = plt.subplot(2, 2, i + 1)
         subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                          cmap=plt.cm.Blues_r)
         a = subplot.contour(xx, yy, Z, levels=[threshold],

@@ -97,11 +113,12 @@
         subplot.legend(
             [a.collections[0], b, c],
             ['learned decision function', 'true inliers', 'true outliers'],
-            prop=matplotlib.font_manager.FontProperties(size=11),
+            prop=matplotlib.font_manager.FontProperties(size=10),
             loc='lower right')
-        subplot.set_title("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
+        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
         subplot.set_xlim((-7, 7))
         subplot.set_ylim((-7, 7))
-    plt.subplots_adjust(0.04, 0.1, 0.96, 0.92, 0.1, 0.26)
+    plt.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
+    plt.suptitle("Outlier detection")

 plt.show()
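
One design choice in the example above is worth spelling out: given an assumed contamination fraction, the decision threshold is simply the matching percentile of the scores on the training data. A standalone toy version of that step (the score array is invented for illustration):

import numpy as np
from scipy import stats

scores_pred = np.random.RandomState(0).randn(100)  # stand-in: higher == more normal
outliers_fraction = 0.25
threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
is_inlier = scores_pred > threshold  # roughly 75% of the points are kept
print(threshold, is_inlier.mean())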
