当前位置 博文首页 > python提取内容关键词的方法

    python提取内容关键词的方法

    作者:上大王 时间:2021-07-27 17:45

    本文实例讲述了python提取内容关键词的方法。分享给大家供大家参考。具体分析如下:

    下面是一段非常高效的提取内容关键词的Python代码。这段代码只能直接用于英文文章内容;中文需要先进行分词,这段代码本身无法处理,不过只要加上分词功能,效果和英文是一样的。

    复制代码 代码如下:

    # coding=UTF-8
    import nltk
    from nltk.corpus import brown
    # This is a fast and simple noun phrase extractor (based on NLTK)
    # Feel free to use it, just keep a link back to this post
    # http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
    # Created by Shlomi Babluki
    # May, 2013
     
    # This is our fast Part of Speech tagger
    #
    # Three taggers are chained through their ``backoff`` arguments:
    #   bigram_tagger -> unigram_tagger -> regexp_tagger
    # These statements run at module level, so the taggers are built as soon
    # as the module is loaded.
    #############################################################################
    # Tagged sentences from the 'news' category of the Brown corpus; passed to
    # both n-gram taggers below.
    brown_train = brown.tagged_sents(categories='news')
    # Last-resort tagger: a list of (regular expression, tag) pairs. The final
    # '.*' entry accepts any token, so this tagger always has a fallback tag
    # ('NN'). The order of the entries is significant (see the NLTK
    # RegexpTagger documentation), so do not reorder them casually.
    regexp_tagger = nltk.RegexpTagger(
        # Optionally signed digits, optionally followed by one character and
        # more digits.
        # NOTE(review): the '.' before the second digit group is unescaped, so
        # it accepts any character (e.g. "3,000" or "4x4"), not only a decimal
        # point -- confirm whether that breadth is intended before escaping it.
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
         # A dash, colon or semicolon.
         (r'(-|:|;)$', ':'),
         # Zero or more apostrophes up to the end of the token.
         # NOTE(review): presumably meant for tokens made up of quote
         # characters -- verify against how RegexpTagger anchors patterns.
         (r'\'*$', 'MD'),
         # English articles, capitalised or lower-case.
         (r'(The|the|A|a|An|an)$', 'AT'),
         # Tokens ending in "able".
         (r'.*able$', 'JJ'),
         # Tokens starting with an ASCII capital letter.
         (r'^[A-Z].*$', 'NNP'),
         # Tokens ending in "ness".
         (r'.*ness$', 'NN'),
         # Tokens ending in "ly".
         (r'.*ly$', 'RB'),
         # Tokens ending in "s".
         (r'.*s$', 'NNS'),
         # Tokens ending in "ing".
         (r'.*ing$', 'VBG'),
         # Tokens ending in "ed".
         (r'.*ed$', 'VBD'),
         # Catch-all: everything else is tagged 'NN'.
         (r'.*', 'NN')
    ])
    # Unigram tagger built from brown_train; falls back to regexp_tagger.
    unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
    # Bigram tagger built from brown_train; falls back to unigram_tagger.
    # This is the tagger NPExtractor.extract() calls.
    bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
    #############################################################################
    # Merge table (our "semi-CFG"); extend it according to your own needs.
    # Each key has the form "<left tag>+<right tag>" and names the tags of two
    # neighbouring entries; the value is the tag assigned to the merged entry.
    #############################################################################
    cfg = {
        "NNP+NNP": "NNP",
        "NN+NN": "NNI",
        "NNI+NN": "NNI",
        "JJ+JJ": "JJ",
        "JJ+NN": "NNI",
    }
    #############################################################################
    class NPExtractor(object):
        """Pull the main noun-phrase topics out of a single sentence.

        The sentence is tokenized, tagged with the module-level
        ``bigram_tagger``, the tags are normalized, and neighbouring entries
        are then merged according to the module-level ``cfg`` table.
        """

        def __init__(self, sentence):
            """Store the raw sentence that ``extract()`` will process."""
            self.sentence = sentence

        def tokenize_sentence(self, sentence):
            """Split *sentence* into single words/tokens with NLTK."""
            return nltk.word_tokenize(sentence)

        def normalize_tags(self, tagged):
            """Return a new list of ``(word, tag)`` pairs with simplified tags.

            Exactly one rule is applied per entry, the first that fits:

            * ``"NP-TL"`` or ``"NP"`` becomes ``"NNP"``;
            * a tag ending in ``"-TL"`` loses that suffix (the trailing-"S"
              rule is not applied to the result, so ``"NNS-TL"`` becomes
              ``"NNS"``);
            * a tag ending in ``"S"`` loses that final letter (``"NNS"``
              becomes ``"NN"``);
            * any other tag is kept unchanged.
            """
            normalized = []
            for entry in tagged:
                word, tag = entry[0], entry[1]
                if tag in ("NP-TL", "NP"):
                    tag = "NNP"
                elif tag.endswith("-TL"):
                    tag = tag[:-3]
                elif tag.endswith("S"):
                    tag = tag[:-1]
                normalized.append((word, tag))
            return normalized

        def extract(self):
            """Return the topic phrases found in ``self.sentence``.

            Returns:
                A list of strings, in sentence order, holding the text of
                every entry whose final tag is ``"NNP"`` or ``"NNI"``. Entries
                tagged plain ``"NN"`` are not reported.
            """
            tokens = self.tokenize_sentence(self.sentence)
            tags = self.normalize_tags(bigram_tagger.tag(tokens))

            # Merge one neighbouring pair at a time, restarting the scan from
            # the left after every merge, until a full pass finds nothing
            # left to merge.
            merged_something = True
            while merged_something:
                merged_something = False
                for idx in range(len(tags) - 1):
                    left, right = tags[idx], tags[idx + 1]
                    merged_tag = cfg.get("%s+%s" % (left[1], right[1]), '')
                    if not merged_tag:
                        continue
                    # Replace the two entries with a single combined entry.
                    phrase = "%s %s" % (left[0], right[0])
                    tags[idx:idx + 2] = [(phrase, merged_tag)]
                    merged_something = True
                    break

            return [text for text, tag in tags if tag in ("NNP", "NNI")]
    # Main method, just run "python np_extractor.py"
    def main():
        """Run the extractor on a sample sentence and print the topics found."""
        sentence = "Swayy is a beautiful new dashboard for discovering and curating online content."
        np_extractor = NPExtractor(sentence)
        result = np_extractor.extract()
        # Parenthesised single-argument form: prints the same text whether it
        # is parsed as a Python 2 print statement or a Python 3 function call.
        # The bare Python 2 statement is a SyntaxError on Python 3.
        print("This sentence is about: %s" % ", ".join(result))


    if __name__ == '__main__':
        main()

    希望本文所述对大家的Python程序设计有所帮助。

    jsjbwy