自然语言处理(Natural Language Processing,NLP)是计算机科学、人工智能、语言学等学科交叉领域的一项技术,其目的是让计算机能够识别、理解、分析和生成人类自然语言的信息。Python 作为当前较为流行的编程语言之一,提供了丰富的自然语言处理库和工具。本文将介绍 Python 在自然语言处理方面的应用,包括文本处理、情感分析、主题建模等内容。
import nltk

# Sample paragraph to tokenize. Adjacent string literals are concatenated by
# the parser, so the text is one line with single spaces between fragments
# and no stray backslashes (as in-string line continuations would risk).
text = (
    "Natural language processing (NLP) is a subfield of linguistics, "
    "computer science, information engineering, "
    "and artificial intelligence concerned with the interactions between "
    "computers and human language, in particular "
    "how to program computers to process and analyze large amounts of "
    "natural language data."
)

try:
    tokens = nltk.word_tokenize(text)
except LookupError:
    # The tokenizer models ship separately from the nltk package. Fetch them
    # on first use ("punkt_tab" for recent NLTK releases, "punkt" for older
    # ones) and retry once.
    nltk.download("punkt")
    nltk.download("punkt_tab")
    tokens = nltk.word_tokenize(text)

# Show the resulting list of tokens.
print(tokens)
import spacy

# Load the `en_core_web_sm` pipeline. It is installed separately from spaCy
# itself (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

# Sample paragraph to tokenize. Adjacent string literals are concatenated by
# the parser, so the text is one line with single spaces between fragments
# and no stray backslashes (as in-string line continuations would risk).
text = (
    "Natural language processing (NLP) is a subfield of linguistics, "
    "computer science, information engineering, "
    "and artificial intelligence concerned with the interactions between "
    "computers and human language, in particular "
    "how to program computers to process and analyze large amounts of "
    "natural language data."
)

# Run the pipeline over the text and print each token's text on its own line.
doc = nlp(text)
for token in doc:
    print(token.text)
from textblob import TextBlob

# Score a short review and report whether it reads as positive, negative,
# or neutral, based on the sign of the polarity value.
text = "I love this product, it's great!"
blob = TextBlob(text)
sentiment = blob.sentiment.polarity

# Above zero is positive, below zero is negative; anything else is neutral.
verdict = (
    "Positive sentiment"
    if sentiment > 0
    else "Negative sentiment"
    if sentiment < 0
    else "Neutral sentiment"
)
print(verdict)
from pprint import pprint

import gensim
from gensim import corpora

# Three short documents used as the toy corpus. Each one is assembled from
# adjacent string literals so it stays a single line of text with no stray
# backslashes, which would otherwise end up as bogus "\" tokens after the
# whitespace split below.
doc_list = [
    (
        "Natural language processing (NLP) is a subfield of linguistics, "
        "computer science, information engineering, "
        "and artificial intelligence concerned with the interactions between "
        "computers and human language, in particular "
        "how to program computers to process and analyze large amounts of "
        "natural language data."
    ),
    (
        "TextBlob is a Python (2 and 3) library for processing textual data. "
        "It provides a simple API for diving into "
        "common natural language processing (NLP) tasks such as "
        "part-of-speech tagging, noun phrase extraction, sentiment "
        "analysis, classification, translation, and more."
    ),
    (
        "Gensim is an open-source library for unsupervised topic modeling "
        "and natural language processing, using modern "
        "statistical machine learning. Gensim is designed to handle large "
        "text collections using data streaming and "
        "incremental online algorithms, which differentiates it from most "
        "other machine learning software packages that "
        "target only in-memory processing."
    ),
]

# Common function words to drop before building the vocabulary.
stop_list = {"for", "a", "of", "the", "and", "to", "in", "is", "with"}

# Lowercase each document, split on whitespace, and filter out stop words.
# Punctuation stays attached to words because only whitespace is split on.
texts = [
    [word for word in document.lower().split() if word not in stop_list]
    for document in doc_list
]

# Map each distinct token to an integer id, then turn every document into a
# bag-of-words representation over that dictionary.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit a three-topic LDA model; random_state is fixed so runs are repeatable.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)

# Print the learned topics.
pprint(lda_model.print_topics())