BVB Source Codes

TextBlob Show classifiers.py Source code

Download the TextBlob classifiers.py source code (file type: .py)
  1. # -*- coding: utf-8 -*-
  2. """Various classifier implementations. Also includes basic feature extractor
  3. methods.
  4.  
  5. Example Usage:
  6. ::
  7.  
  8.    >>> from textblob import TextBlob
  9.    >>> from textblob.classifiers import NaiveBayesClassifier
  10.    >>> train = [
  11.    ...     ('I love this sandwich.', 'pos'),
  12.    ...     ('This is an amazing place!', 'pos'),
  13.    ...     ('I feel very good about these beers.', 'pos'),
  14.    ...     ('I do not like this restaurant', 'neg'),
  15.    ...     ('I am tired of this stuff.', 'neg'),
  16.    ...     ("I can't deal with this", 'neg'),
  17.    ...     ("My boss is horrible.", "neg")
  18.    ... ]
  19.    >>> cl = NaiveBayesClassifier(train)
  20.    >>> cl.classify("I feel amazing!")
  21.    'pos'
  22.    >>> blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)
  23.    >>> for s in blob.sentences:
  24.    ...     print(s)
  25.    ...     print(s.classify())
  26.    ...
  27.    The beer is good.
  28.    pos
  29.    But the hangover is horrible.
  30.    neg
  31.  
  32. .. versionadded:: 0.6.0
  33. """
  34. from __future__ import absolute_import
  35. from itertools import chain
  36.  
  37. import nltk
  38.  
  39. from textblob.compat import basestring
  40. from textblob.decorators import cached_property
  41. from textblob.exceptions import FormatError
  42. from textblob.tokenizers import word_tokenize
  43. from textblob.utils import strip_punc, is_filelike
  44. import textblob.formats as formats
  45.  
  46. ### Basic feature extractors ###
  47.  
  48.  
  49. def _get_words_from_dataset(dataset):
  50.     """Return a set of all words in a dataset.
  51.  
  52.    :param dataset: A list of tuples of the form ``(words, label)`` where
  53.        ``words`` is either a string of a list of tokens.
  54.    """
  55.     # Words may be either a string or a list of tokens. Return an iterator
  56.     # of tokens accordingly
  57.     def tokenize(words):
  58.         if isinstance(words, basestring):
  59.             return word_tokenize(words, include_punc=False)
  60.         else:
  61.             return words
  62.     all_words = chain.from_iterable(tokenize(words) for words, _ in dataset)
  63.     return set(all_words)
  64.  
  65. def _get_document_tokens(document):
  66.     if isinstance(document, basestring):
  67.         tokens = set((strip_punc(w, all=False)
  68.                     for w in word_tokenize(document, include_punc=False)))
  69.     else:
  70.         tokens = set(strip_punc(w, all=False) for w in document)
  71.     return tokens
  72.  
  73. def basic_extractor(document, train_set):
  74.     """A basic document feature extractor that returns a dict indicating
  75.    what words in ``train_set`` are contained in ``document``.
  76.  
  77.    :param document: The text to extract features from. Can be a string or an iterable.
  78.    :param list train_set: Training data set, a list of tuples of the form
  79.        ``(words, label)``.
  80.    """
  81.     word_features = _get_words_from_dataset(train_set)
  82.     tokens = _get_document_tokens(document)
  83.     features = dict(((u'contains({0})'.format(word), (word in tokens))
  84.                                             for word in word_features))
  85.     return features
  86.  
  87.  
  88. def contains_extractor(document):
  89.     """A basic document feature extractor that returns a dict of words that
  90.    the document contains.
  91.    """
  92.     tokens = _get_document_tokens(document)
  93.     features = dict((u'contains({0})'.format(w), True) for w in tokens)
  94.     return features
  95.  
  96. ##### CLASSIFIERS #####
  97.  
  98. class BaseClassifier(object):
  99.     """Abstract classifier class from which all classifers inherit. At a
  100.    minimum, descendant classes must implement a ``classify`` method and have
  101.    a ``classifier`` property.
  102.  
  103.    :param train_set: The training set, either a list of tuples of the form
  104.        ``(text, classification)`` or a file-like object. ``text`` may be either
  105.        a string or an iterable.
  106.    :param callable feature_extractor: A feature extractor function that takes one or
  107.        two arguments: ``document`` and ``train_set``.
  108.    :param str format: If ``train_set`` is a filename, the file format, e.g.
  109.        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
  110.        file format.
  111.    :param kwargs: Additional keyword arguments are passed to the constructor
  112.        of the :class:`Format <textblob.formats.BaseFormat>` class used to
  113.        read the data. Only applies when a file-like object is passed as
  114.        ``train_set``.
  115.  
  116.    .. versionadded:: 0.6.0
  117.    """
  118.  
  119.     def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **kwargs):
  120.         self.format_kwargs = kwargs
  121.         self.feature_extractor = feature_extractor
  122.         if is_filelike(train_set):
  123.             self.train_set = self._read_data(train_set, format)
  124.         else:  # train_set is a list of tuples
  125.             self.train_set = train_set
  126.         self.train_features = None
  127.  
  128.     def _read_data(self, dataset, format=None):
  129.         """Reads a data file and returns an iterable that can be used
  130.        as testing or training data.
  131.        """
  132.         # Attempt to detect file format if "format" isn't specified
  133.         if not format:
  134.             format_class = formats.detect(dataset)
  135.             if not format_class:
  136.                 raise FormatError('Could not automatically detect format for the given '
  137.                                   'data source.')
  138.         else:
  139.             registry = formats.get_registry()
  140.             if format not in registry.keys():
  141.                 raise ValueError("'{0}' format not supported.".format(format))
  142.             format_class = registry[format]
  143.         return format_class(dataset, **self.format_kwargs).to_iterable()
  144.  
  145.     @cached_property
  146.     def classifier(self):
  147.         """The classifier object."""
  148.         raise NotImplementedError('Must implement the "classifier" property.')
  149.  
  150.     def classify(self, text):
  151.         """Classifies a string of text."""
  152.         raise NotImplementedError('Must implement a "classify" method.')
  153.  
  154.     def train(self, labeled_featureset):
  155.         """Trains the classifier."""
  156.         raise NotImplementedError('Must implement a "train" method.')
  157.  
  158.     def labels(self):
  159.         """Returns an iterable containing the possible labels."""
  160.         raise NotImplementedError('Must implement a "labels" method.')
  161.  
  162.     def extract_features(self, text):
  163.         '''Extracts features from a body of text.
  164.  
  165.        :rtype: dictionary of features
  166.        '''
  167.         # Feature extractor may take one or two arguments
  168.         try:
  169.             return self.feature_extractor(text, self.train_set)
  170.         except (TypeError, AttributeError):
  171.             return self.feature_extractor(text)
  172.  
  173.  
  174. class NLTKClassifier(BaseClassifier):
  175.     """An abstract class that wraps around the nltk.classify module.
  176.  
  177.    Expects that descendant classes include a class variable ``nltk_class``
  178.    which is the class in the nltk.classify module to be wrapped.
  179.  
  180.    Example: ::
  181.  
  182.        class MyClassifier(NLTKClassifier):
  183.            nltk_class = nltk.classify.svm.SvmClassifier
  184.    """
  185.  
  186.     #: The NLTK class to be wrapped. Must be a class within nltk.classify
  187.     nltk_class = None
  188.  
  189.     def __init__(self, train_set,
  190.                  feature_extractor=basic_extractor, format=None, **kwargs):
  191.         super(NLTKClassifier, self).__init__(train_set, feature_extractor, format, **kwargs)
  192.         self.train_features = [(self.extract_features(d), c) for d, c in self.train_set]
  193.  
  194.     def __repr__(self):
  195.         class_name = self.__class__.__name__
  196.         return "<{cls} trained on {n} instances>".format(cls=class_name,
  197.                                                         n=len(self.train_set))
  198.  
  199.     @cached_property
  200.     def classifier(self):
  201.         """The classifier."""
  202.         try:
  203.             return self.train()
  204.         except AttributeError:  # nltk_class has not been defined
  205.             raise ValueError("NLTKClassifier must have a nltk_class"
  206.                             " variable that is not None.")
  207.  
  208.     def train(self, *args, **kwargs):
  209.         """Train the classifier with a labeled feature set and return
  210.        the classifier. Takes the same arguments as the wrapped NLTK class.
  211.        This method is implicitly called when calling ``classify`` or
  212.        ``accuracy`` methods and is included only to allow passing in arguments
  213.        to the ``train`` method of the wrapped NLTK class.
  214.  
  215.        .. versionadded:: 0.6.2
  216.  
  217.        :rtype: A classifier
  218.        """
  219.         try:
  220.             self.classifier = self.nltk_class.train(self.train_features,
  221.                                                     *args, **kwargs)
  222.             return self.classifier
  223.         except AttributeError:
  224.             raise ValueError("NLTKClassifier must have a nltk_class"
  225.                             " variable that is not None.")
  226.  
  227.     def labels(self):
  228.         """Return an iterable of possible labels."""
  229.         return self.classifier.labels()
  230.  
  231.     def classify(self, text):
  232.         """Classifies the text.
  233.  
  234.        :param str text: A string of text.
  235.        """
  236.         text_features = self.extract_features(text)
  237.         return self.classifier.classify(text_features)
  238.  
  239.     def accuracy(self, test_set, format=None):
  240.         """Compute the accuracy on a test set.
  241.  
  242.        :param test_set: A list of tuples of the form ``(text, label)``, or a
  243.            file pointer.
  244.        :param format: If ``test_set`` is a filename, the file format, e.g.
  245.            ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
  246.            file format.
  247.        """
  248.         if is_filelike(test_set):
  249.             test_data = self._read_data(test_set)
  250.         else:  # test_set is a list of tuples
  251.             test_data = test_set
  252.         test_features = [(self.extract_features(d), c) for d, c in test_data]
  253.         return nltk.classify.accuracy(self.classifier, test_features)
  254.  
  255.     def update(self, new_data, *args, **kwargs):
  256.         """Update the classifier with new training data and re-trains the
  257.        classifier.
  258.  
  259.        :param new_data: New data as a list of tuples of the form
  260.            ``(text, label)``.
  261.        """
  262.         self.train_set += new_data
  263.         self.train_features = [(self.extract_features(d), c)
  264.                                 for d, c in self.train_set]
  265.         try:
  266.             self.classifier = self.nltk_class.train(self.train_features,
  267.                                                     *args, **kwargs)
  268.         except AttributeError:  # Descendant has not defined nltk_class
  269.             raise ValueError("NLTKClassifier must have a nltk_class"
  270.                             " variable that is not None.")
  271.         return True
  272.  
  273.  
  274. class NaiveBayesClassifier(NLTKClassifier):
  275.     """A classifier based on the Naive Bayes algorithm, as implemented in
  276.    NLTK.
  277.  
  278.    :param train_set: The training set, either a list of tuples of the form
  279.        ``(text, classification)`` or a filename. ``text`` may be either
  280.        a string or an iterable.
  281.    :param feature_extractor: A feature extractor function that takes one or
  282.        two arguments: ``document`` and ``train_set``.
  283.    :param format: If ``train_set`` is a filename, the file format, e.g.
  284.        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
  285.        file format.
  286.  
  287.    .. versionadded:: 0.6.0
  288.    """
  289.  
  290.     nltk_class = nltk.classify.NaiveBayesClassifier
  291.  
  292.     def prob_classify(self, text):
  293.         """Return the label probability distribution for classifying a string
  294.        of text.
  295.  
  296.        Example:
  297.        ::
  298.  
  299.            >>> classifier = NaiveBayesClassifier(train_data)
  300.            >>> prob_dist = classifier.prob_classify("I feel happy this morning.")
  301.            >>> prob_dist.max()
  302.            'positive'
  303.            >>> prob_dist.prob("positive")
  304.            0.7
  305.  
  306.        :rtype: nltk.probability.DictionaryProbDist
  307.        """
  308.         text_features = self.extract_features(text)
  309.         return self.classifier.prob_classify(text_features)
  310.  
  311.     def informative_features(self, *args, **kwargs):
  312.         """Return the most informative features as a list of tuples of the
  313.        form ``(feature_name, feature_value)``.
  314.  
  315.        :rtype: list
  316.        """
  317.         return self.classifier.most_informative_features(*args, **kwargs)
  318.  
  319.     def show_informative_features(self, *args, **kwargs):
  320.         """Displays a listing of the most informative features for this
  321.        classifier.
  322.  
  323.        :rtype: None
  324.        """
  325.         return self.classifier.show_most_informative_features(*args, **kwargs)
  326.  
  327.  
  328. class DecisionTreeClassifier(NLTKClassifier):
  329.     """A classifier based on the decision tree algorithm, as implemented in
  330.    NLTK.
  331.  
  332.    :param train_set: The training set, either a list of tuples of the form
  333.        ``(text, classification)`` or a filename. ``text`` may be either
  334.        a string or an iterable.
  335.    :param feature_extractor: A feature extractor function that takes one or
  336.        two arguments: ``document`` and ``train_set``.
  337.    :param format: If ``train_set`` is a filename, the file format, e.g.
  338.        ``"csv"`` or ``"json"``. If ``None``, will attempt to detect the
  339.        file format.
  340.  
  341.    .. versionadded:: 0.6.2
  342.    """
  343.  
  344.     nltk_class = nltk.classify.decisiontree.DecisionTreeClassifier
  345.  
  346.     def pretty_format(self, *args, **kwargs):
  347.         """Return a string containing a pretty-printed version of this decision
  348.        tree. Each line in the string corresponds to a single decision tree node
  349.        or leaf, and indentation is used to display the structure of the tree.
  350.  
  351.        :rtype: str
  352.        """
  353.         return self.classifier.pretty_format(*args, **kwargs)
  354.  
  355.     # Backwards-compat
  356.     pprint = pretty_format
  357.  
  358.     def pseudocode(self, *args, **kwargs):
  359.         """Return a string representation of this decision tree that expresses
  360.        the decisions it makes as a nested set of pseudocode if statements.
  361.  
  362.        :rtype: str
  363.        """
  364.         return self.classifier.pseudocode(*args, **kwargs)
  365.  
  366.  
  367. class PositiveNaiveBayesClassifier(NLTKClassifier):
  368.     """A variant of the Naive Bayes Classifier that performs binary
  369.    classification with partially-labeled training sets, i.e. when only
  370.    one class is labeled and the other is not. Assuming a prior distribution
  371.    on the two labels, uses the unlabeled set to estimate the frequencies of
  372.    the features.
  373.  
  374.    Example usage:
  375.    ::
  376.  
  377.        >>> from text.classifiers import PositiveNaiveBayesClassifier
  378.        >>> sports_sentences = ['The team dominated the game',
  379.        ...                   'They lost the ball',
  380.        ...                   'The game was intense',
  381.        ...                   'The goalkeeper catched the ball',
  382.        ...                   'The other team controlled the ball']
  383.        >>> various_sentences = ['The President did not comment',
  384.        ...                        'I lost the keys',
  385.        ...                        'The team won the game',
  386.        ...                        'Sara has two kids',
  387.        ...                        'The ball went off the court',
  388.        ...                        'They had the ball for the whole game',
  389.        ...                        'The show is over']
  390.        >>> classifier = PositiveNaiveBayesClassifier(positive_set=sports_sentences,
  391.        ...                                           unlabeled_set=various_sentences)
  392.        >>> classifier.classify("My team lost the game")
  393.        True
  394.        >>> classifier.classify("And now for something completely different.")
  395.        False
  396.  
  397.  
  398.    :param positive_set: A collection of strings that have the positive label.
  399.    :param unlabeled_set: A collection of unlabeled strings.
  400.    :param feature_extractor: A feature extractor function.
  401.    :param positive_prob_prior: A prior estimate of the probability of the
  402.        label ``True``.
  403.  
  404.    .. versionadded:: 0.7.0
  405.    """
  406.  
  407.     nltk_class = nltk.classify.PositiveNaiveBayesClassifier
  408.  
  409.     def __init__(self, positive_set, unlabeled_set,
  410.                 feature_extractor=contains_extractor,
  411.                 positive_prob_prior=0.5, **kwargs):
  412.         self.feature_extractor = feature_extractor
  413.         self.positive_set = positive_set
  414.         self.unlabeled_set = unlabeled_set
  415.         self.positive_features = [self.extract_features(d)
  416.                                     for d in self.positive_set]
  417.         self.unlabeled_features = [self.extract_features(d)
  418.                                     for d in self.unlabeled_set]
  419.         self.positive_prob_prior = positive_prob_prior
  420.  
  421.     def __repr__(self):
  422.         class_name = self.__class__.__name__
  423.         return "<{cls} trained on {n_pos} labeled and {n_unlabeled} unlabeled instances>"\
  424.                         .format(cls=class_name, n_pos=len(self.positive_set),
  425.                                 n_unlabeled=len(self.unlabeled_set))
  426.  
  427.     # Override
  428.     def train(self, *args, **kwargs):
  429.         """Train the classifier with a labeled and unlabeled feature sets and return
  430.        the classifier. Takes the same arguments as the wrapped NLTK class.
  431.        This method is implicitly called when calling ``classify`` or
  432.        ``accuracy`` methods and is included only to allow passing in arguments
  433.        to the ``train`` method of the wrapped NLTK class.
  434.  
  435.        :rtype: A classifier
  436.        """
  437.         self.classifier = self.nltk_class.train(self.positive_features,
  438.                                                 self.unlabeled_features,
  439.                                                 self.positive_prob_prior)
  440.         return self.classifier
  441.  
  442.     def update(self, new_positive_data=None,
  443.                new_unlabeled_data=None, positive_prob_prior=0.5,
  444.                *args, **kwargs):
  445.         """Update the classifier with new data and re-trains the
  446.        classifier.
  447.  
  448.        :param new_positive_data: List of new, labeled strings.
  449.        :param new_unlabeled_data: List of new, unlabeled strings.
  450.        """
  451.         self.positive_prob_prior = positive_prob_prior
  452.         if new_positive_data:
  453.             self.positive_set += new_positive_data
  454.             self.positive_features += [self.extract_features(d)
  455.                                             for d in new_positive_data]
  456.         if new_unlabeled_data:
  457.             self.unlabeled_set += new_unlabeled_data
  458.             self.unlabeled_features += [self.extract_features(d)
  459.                                             for d in new_unlabeled_data]
  460.         self.classifier = self.nltk_class.train(self.positive_features,
  461.                                                 self.unlabeled_features,
  462.                                                 self.positive_prob_prior,
  463.                                                 *args, **kwargs)
  464.         return True
  465.  
  466.  
  467. class MaxEntClassifier(NLTKClassifier):
  468.     __doc__ = nltk.classify.maxent.MaxentClassifier.__doc__
  469.     nltk_class = nltk.classify.maxent.MaxentClassifier
  470.  
  471.     def prob_classify(self, text):
  472.         """Return the label probability distribution for classifying a string
  473.        of text.
  474.  
  475.        Example:
  476.        ::
  477.  
  478.            >>> classifier = MaxEntClassifier(train_data)
  479.            >>> prob_dist = classifier.prob_classify("I feel happy this morning.")
  480.            >>> prob_dist.max()
  481.            'positive'
  482.            >>> prob_dist.prob("positive")
  483.            0.7
  484.  
  485.        :rtype: nltk.probability.DictionaryProbDist
  486.        """
  487.         feats = self.extract_features(text)
  488.         return self.classifier.prob_classify(feats)
  489.  
Download classifiers.py source code - Download TextBlob source code
Related Source Codes/Software:
monaco-editor - A browser based code editor 2017-01-07
grumpy - Grumpy is a Python to Go source code transcompiler... 2017-01-07
flinux - Foreign LINUX - Run unmodified Linux applications ... 2017-01-07
beeswithmachineguns - A utility for arming (creating) many bees (micro E... 2017-01-07
TopDeepLearning - A list of popular github projects related to deep ... 2017-01-07
yui3 - A library for building richly interactive web appl... 2017-01-07
KineticJS - KineticJS is an HTML5 Canvas JavaScript framework ... 2017-01-07
web-design-standards - Open source UI components and visual style guide f... 2017-01-07
Knuff - The debug application for Apple Push Notification ... 2017-01-07
RoundedImageView - A fast ImageView that supports rounded corners, ov... 2017-01-07
CRYENGINE - CRYENGINE is a powerful real-time game development... 2017-06-11
postal - 2017-06-11
reactide - Reactide is the first dedicated IDE for React web ... 2017-06-11
rkt - rkt is a pod-native container engine for Linux. It... 2017-06-11
uWebSockets - Tiny WebSockets https://for... 2017-06-11
realworld - TodoMVC for the RealWorld - Exemplary fullstack Me... 2017-06-11
goreplay - GoReplay is an open-source tool for capturing and ... 2017-06-10
pyenv - Simple Python version management 2017-06-10
redux-saga - An alternative side effect model for Redux apps ... 2017-06-10
angular-starter - 2017-06-10

 Back to top