{"id":300,"date":"2020-02-24T11:04:36","date_gmt":"2020-02-24T02:04:36","guid":{"rendered":"http:\/\/localhost:8000\/?p=300"},"modified":"2021-01-17T11:06:33","modified_gmt":"2021-01-17T02:06:33","slug":"nlp-preprocessing","status":"publish","type":"post","link":"http:\/\/localhost:8000\/2020\/02\/nlp-preprocessing.html","title":{"rendered":"NLP\u306e\u524d\u51e6\u7406"},"content":{"rendered":"

\u4ed5\u4e8b\u3067\u81ea\u7136\u8a00\u8a9e\u51e6\u7406\uff08NLP\uff09\u306b\u5c11\u3057\u53d6\u308a\u7d44\u3080\u5fc5\u8981\u304c\u51fa\u3066\u304d\u305f\u306e\u3067\u3001\u81ea\u5206\u306a\u308a\u306e\u7406\u89e3\u3092Tips\u3068\u3057\u3066\u307e\u3068\u3081\u3066\u3044\u3053\u3046\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n

\u5c0f\u6587\u5b57\u5316<\/h2>\n

\u6587\u5b57\u306e\u6b63\u898f\u5316\u3068\u3044\u3046\u610f\u5473\u3067\u3001\u30a2\u30eb\u30d5\u30a1\u30d9\u30c3\u30c8\u3092\u5c0f\u6587\u5b57\u5316\u3057\u307e\u3059\u3002\u65e5\u672c\u8a9e\u306e\u5834\u5408\u306f\u3001\u534a\u89d2\u3092\u5168\u89d2\u306b\u7d71\u4e00\u3059\u308b\u3001\u306a\u3069\u306e\u5bfe\u5fdc\u3082\u5fc5\u8981\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n

sentences: List[str] = ['I  have  a pen', 'That  is a window']\nprint(sentences)\n# -> ['I  have  a pen', 'That  is a window']\n\nlower_sentences: List[str] = list(\n    sentence.lower() for sentence in sentences\n)\nprint(lower_sentences)\n# -> ['i  have  a pen', 'that  is a window']<\/code><\/pre>\n

tokenize<\/h2>\n

\u6587\u66f8\u3092\u30c8\u30fc\u30af\u30f3\uff08\u6700\u5c0f\u5358\u4f4d\u306e\u6587\u5b57\u3084\u6587\u5b57\u5217\uff09\u306b\u5206\u5272\u3057\u307e\u3059\u3002\u5206\u5272\u306b\u306f\u3001nltk\uff08\u81ea\u7136\u8a00\u8a9e\u51e6\u7406\u306e\u30c4\u30fc\u30eb\u30ad\u30c3\u30c8\u3092\u63d0\u4f9b\u3059\u308b\u30e9\u30a4\u30d6\u30e9\u30ea\uff09\u3092\u5229\u7528\u3057\u307e\u3059\u3002\u82f1\u8a9e\u306a\u306e\u3067punkt<\/code>\u30d1\u30c3\u30b1\u30fc\u30b8\u3092\u4f7f\u3063\u3066\u3044\u307e\u3059\u3002
\n\u6587\u66f8\u306e\u30ea\u30b9\u30c8\u3092\u30c8\u30fc\u30af\u30f3\u306b\u5206\u5272\u3059\u308b\u30b5\u30f3\u30d7\u30eb\u306f\u4ee5\u4e0b\u306e\u901a\u308a\u3067\u3059\u3002<\/p>\n

import nltk\nnltk.download('punkt')\nfrom typing import List\nsentences: List[str] = ['i  have  a pen', 'that  is a window']\nprint(sentences)\n# -> ['i  have  a pen', 'that  is a window']\n\nwords_list: List[List[str]] = list(\n    nltk.tokenize.word_tokenize(sentence) for sentence in sentences\n)\nprint(words_list)\n# -> [['i', 'have', 'a', 'pen'], ['that', 'is', 'a', 'window']]<\/code><\/pre>\n

stop-words\u9664\u5916<\/h2>\n

stop-word\u3068\u306f\u3001the<\/code>,a<\/code>,for<\/code>,of<\/code>\u306e\u3088\u3046\u306a\u4e00\u822c\u8a9e\u306a\u3069\u3001\u5206\u6790\u306b\u5f71\u97ff\u3092\u4e0e\u3048\u306a\u3044\u5358\u8a9e\u306e\u3053\u3068\u3067\u3001\u3053\u308c\u3089\u3092\u9664\u5916\u3059\u308b\u3053\u3068\u306b\u3088\u3063\u3066\u5f8c\u7d9a\u51e6\u7406\u306e\u8a08\u7b97\u91cf\u3092\u4e0b\u3052\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002nltk<\/code>\u3067stopwords<\/code>\u304c\u5b9a\u7fa9\u3055\u308c\u3066\u3044\u308b\u306e\u3067\u3001\u305d\u3061\u3089\u3092\u5229\u7528\u3059\u308b\u30b5\u30f3\u30d7\u30eb\u3092\u8a18\u8f09\u3057\u307e\u3059\u3002<\/p>\n

from typing import List\nimport nltk\nnltk.download('stopwords')\n\nwords_list: List[List[str]] = [['i', 'have', 'a', 'pen'], ['that', 'is', 'a', 'window']]\nstopwords: List[str] = nltk.corpus.stopwords.words('english')\nprint(stopwords)\n# -> ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', ...\uff08\u7701\u7565\uff09]\n\nnormalized_words_list: List[List[str]] = list(\n    list(word for word in words if word not in stopwords) for words in words_list\n)\nprint(normalized_words_list)\n# -> [['pen'], ['window']]<\/code><\/pre>\n

\u4e0a\u8a18\u4ee5\u5916\u306b\u3001\u8a18\u53f7\u30841\u6587\u5b57\u306e\u82f1\u6570\u5b57\u3092\u9664\u5916\u3057\u305f\u3044\u30b1\u30fc\u30b9\u3082\u3042\u308b\u3068\u601d\u3044\u307e\u3059\u3002\u305d\u306e\u5834\u5408\u306fstring<\/code>\u30d1\u30c3\u30b1\u30fc\u30b8\u3092\u4f7f\u3063\u3066\u6587\u5b57\u3092\u53d6\u5f97\u3057\u3066\u3001\u305d\u308c\u3089\u3092\u4f7f\u3063\u3066\u9664\u5916\u3059\u308b\u3068\u697d\u3060\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n

from typing import List\nimport string\nexclude_words: List[str] = list(string.ascii_lowercase) + list(string.digits) + list(string.punctuation)\nprint(exclude_words)\n# -> ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '\/', ':', ';', '<', '=', '>', '?', '@', '[', '\\\\', ']', '^', '_', '`', '{', '|', '}', '~']<\/code><\/pre>\n

\u30b7\u30ce\u30cb\u30e0\u306e\u9069\u7528<\/h2>\n

TBD<\/p>\n

\u30b9\u30c6\u30df\u30f3\u30b0\uff08stemming\uff09<\/h2>\n

\u30b9\u30c6\u30df\u30f3\u30b0\u3068\u306f\u3001\u8a9e\u5c3e\u304c\u5909\u5316\u3059\u308b\u5358\u8a9e\u306e\u8a9e\u5e79\u90e8\u5206\u3092\u629c\u304d\u51fa\u3059\u51e6\u7406\u306e\u3053\u3068\u3092\u8a00\u3044\u307e\u3059\u3002\u305d\u3053\u3060\u3051\u629c\u304d\u51fa\u3059\u3068\u4eba\u9593\u306b\u306f\u9055\u548c\u611f\u304c\u3042\u308b\u6587\u5b57\u5217\u306b\u306a\u308b\u3053\u3068\u3082\u3042\u308a\u307e\u3059\u3002
\nnltk<\/code>\u306ePorterStemmer<\/code>\u3092\u4f7f\u3046\u30b5\u30f3\u30d7\u30eb\u3092\u8a18\u8f09\u3057\u307e\u3057\u305f\u3002mechanical<\/code>\u304cmechan<\/code>\u306b\u306a\u3063\u305f\u308a\u3001pencils<\/code>\u304cpencil<\/code>\u306b\u5909\u63db\u3055\u308c\u308b\u4e00\u65b9\u3067\u3001went<\/code>\u306fgo<\/code>\u306b\u306f\u306a\u3089\u306a\u3044\u3088\u3046\u3067\u3059\uff08\u8a9e\u5e79\u90e8\u5206\u3092\u629c\u304d\u51fa\u3057\u305f\u3060\u3051\u306a\u306e\u3067\uff09\u3002<\/p>\n

import nltk\nfrom nltk.stem.porter import PorterStemmer\nps: PorterStemmer = PorterStemmer()\nwords: List[str] = ['mechanical', 'pencil', 'go', 'went', 'goes', 'pencils']\nprint(words)\n# -> ['mechanical', 'pencil', 'go', 'went', 'goes', 'pencils']\n\nstemmed_words: List[str] = list(ps.stem(word) for word in words)\nprint(stemmed_words)\n# -> ['mechan', 'pencil', 'go', 'went', 'goe', 'pencil']<\/code><\/pre>\n

\u30b3\u30fc\u30d1\u30b9\u5316<\/h2>\n

\u30b3\u30fc\u30d1\u30b9\u3068\u3044\u3046\u306e\u306f\u3001\u81ea\u7136\u8a00\u8a9e\u51e6\u7406\u3092\u884c\u3044\u3084\u3059\u3044\u5f62\u306b\u3001\u69cb\u9020\u5316\u3055\u308c\u305f\u30c7\u30fc\u30bf\u306e\u3053\u3068\u3092\u3044\u3046\u3089\u3057\u3044\u3067\u3059\u3002\u81ea\u5206\u304c\u3053\u308c\u307e\u3067\u898b\u305f\u3082\u306e\u306f\u30b3\u30fc\u30d1\u30b9\u5316\uff1d\u30d9\u30af\u30c8\u30eb\u5316\u306e\u3088\u3046\u3067\u3057\u305f\u3002
\nwords_list<\/code>\u3092\u3001\u5404\u8981\u7d20\u304c\u5358\u8a9e\u3001\u305d\u306e\u5024\u304c\u51fa\u73fe\u56de\u6570\u3092\u8868\u3059\u30d9\u30af\u30c8\u30eb\u306b\u5909\u63db\u3059\u308b\u30b5\u30f3\u30d7\u30eb\u306f\u4ee5\u4e0b\u306e\u901a\u308a\u3067\u3059\u3002<\/p>\n

from typing import List, Tuple\nfrom gensim import corpora\n\nwords_list: List[List[str]] = [['i', 'have', 'a', 'red', 'pen', 'and', 'a', 'blue', 'pen'], ['you', 'like', 'red']]\nprint(words_list)\n# -> [['i', 'have', 'a', 'red', 'pen', 'and', 'a', 'blue', 'pen'], ['you', 'like', 'red']]\n\ndictionary: corpora.Dictionary = corpora.Dictionary(words_list)\nprint(dictionary.token2id)\n# -> {'a': 0, 'and': 1, 'blue': 2, 'have': 3, 'i': 4, 'pen': 5, 'red': 6, 'like': 7, 'you': 8}\n\ncorpus: List[List[Tuple[int, int]]] = list(map(dictionary.doc2bow, words_list)) # \u30d9\u30af\u30c8\u30eb\u5316\nprint(corpus)\n# -> [[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1)], [(6, 1), (7, 1), (8, 1)]]<\/code><\/pre>\n

dictionary<\/code>\u306f\u5358\u8a9e\u6587\u5b57\u5217 -> ID\uff08\u9023\u756a\u3002int\uff09\u306b\u5909\u63db\u3059\u308b\u8f9e\u66f8\u3067\u3059\u3002\u6587\u5b57\u5217\u3092\u305d\u306e\u307e\u307e\u6271\u3046\u3068\u30e1\u30e2\u30ea\u3092\u98df\u3046\u306e\u3067int\u306b\u5909\u63db\u3057\u3066\u3044\u308b\u306e\u3060\u3068\u3082\u601d\u3044\u307e\u3059\u3002
\n\u3053\u3053\u3067\u4f5c\u6210\u3057\u305fcorpus<\/code>\u306f\u3001\u5217\u304c\u5358\u8a9e\uff08\u6b63\u78ba\u306b\u306f\u5358\u8a9eID\uff09\u3001\u884c\u304c\u6587\u66f8\u3092\u610f\u5473\u3059\u308b\u884c\u5217\u3068\u8003\u3048\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u30021\u884c\uff1d1\u6587\u66f8\u30d9\u30af\u30c8\u30eb\u3067\u3059\u3002
\n\u306a\u304adoc2bow<\/code>\u306ebow<\/code>\u306fbag-of-words<\/code>\u306e\u7565\u3067\u3001\u6587\u66f8\u3092bag-of-words<\/code>\u306b\u5909\u63db\u3059\u308b\u95a2\u6570\u306b\u306a\u308a\u307e\u3059\u3002bag-of-words<\/code>\u306f\u76f4\u8a33\u3059\u308b\u3068\u5358\u8a9e\u306e\u888b<\/code>\u3067\u3001\u5358\u8a9e\u3092\u888b\u306b\u307e\u3068\u3081\u3066\uff08\u3064\u307e\u308a\u5358\u8a9e\u3067\u30b0\u30eb\u30fc\u30d4\u30f3\u30b0\u3057\u3066\uff09\u3001\u305d\u306e\u888b\u306e\u4e2d\u306b\u30e1\u30bf\u60c5\u5831\u3092\u6301\u305f\u305b\u308b\u3068\u3044\u3046\u30a4\u30e1\u30fc\u30b8\u306e\u3088\u3046\u3067\u3059\u3002\u3053\u306e\u30b1\u30fc\u30b9\u3067\u306f\u30e1\u30bf\u60c5\u5831\u3068\u3057\u3066\u5358\u8a9e\u306e\u51fa\u73fe\u56de\u6570\u3092\u4fdd\u6301\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"

\u4ed5\u4e8b\u3067\u81ea\u7136\u8a00\u8a9e\u51e6\u7406\uff08NLP\uff09\u306b\u5c11\u3057\u53d6\u308a\u7d44\u3080\u5fc5\u8981\u304c\u51fa\u3066\u304d\u305f\u306e\u3067\u3001\u81ea\u5206\u306a\u308a\u306e\u7406\u89e3\u3092Tips\u3068\u3057\u3066\u307e\u3068\u3081\u3066\u3044\u3053\u3046\u3068\u601d\u3044\u307e\u3059\u3002 \u5c0f\u6587\u5b57\u5316 \u6587\u5b57\u306e\u6b63\u898f\u5316\u3068\u3044\u3046\u610f\u5473\u3067\u3001\u30a2\u30eb\u30d5\u30a1\u30d9\u30c3\u30c8\u3092\u5c0f\u6587\u5b57\u5316\u3057\u307e\u3059\u3002\u65e5\u672c\u8a9e\u306e\u5834\u5408\u306f\u3001\u534a\u89d2\u3092\u5168\u89d2\u306b\u7d71\u4e00\u3059\u308b\u3001\u306a\u3069\u306e\u5bfe\u5fdc\u3082\u5fc5\u8981\u3068\u601d\u3044\u307e\u3059\u3002 sentences: List[str] = ['I have a pen', 'That is a window'] print(sentences) # -> ['I have a pen', 'That is a window'] lower_sentences: List[str] = list( sentence.lower() for sentence in sentences ) print(lower_sentences) # -> <\/span>Continue Reading<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[30],"tags":[],"_links":{"self":[{"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/posts\/300"}],"collection":[{"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/comments?post=300"}],"version-history":[{"count":1,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/posts\/300\/revisions"}],"predecessor-version":[{"id":301,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/posts\/300\/revisions\/301"}],"wp:attachment":[{"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/media?parent=300"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/localhost:8000\/wp-json\/wp
\/v2\/categories?post=300"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/tags?post=300"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}