{"id":205,"date":"2020-08-14T09:51:46","date_gmt":"2020-08-14T00:51:46","guid":{"rendered":"http:\/\/localhost:8000\/?p=205"},"modified":"2021-01-16T14:20:33","modified_gmt":"2021-01-16T05:20:33","slug":"python-scraping-library","status":"publish","type":"post","link":"http:\/\/localhost:8000\/2020\/08\/python-scraping-library.html","title":{"rendered":"python\u3067\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u3059\u308b\u969b\u306b\u5229\u7528\u3059\u308b\u30e9\u30a4\u30d6\u30e9\u30ea\u6bd4\u8f03"},"content":{"rendered":"
Python\u3067\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u3092\u5b9f\u88c5\u3059\u308b\u6a5f\u4f1a\u304c\u3042\u3063\u305f\u306e\u3067\u3001\u305d\u306e\u4e2d\u3067\u5229\u7528\u3057\u305f\uff08\u3082\u3057\u304f\u306f\u6280\u8853\u691c\u8a3c\u3057\u305f\uff09\u30e9\u30a4\u30d6\u30e9\u30ea\u306b\u3064\u3044\u3066\u3001\u7279\u5fb4\u3084\u3069\u3046\u3044\u3046\u6642\u306b\u5229\u7528\u3059\u308b\u304b\u306b\u3064\u3044\u3066\u500b\u4eba\u7684\u306a\u898b\u89e3\u3092\u66f8\u3044\u3066\u3044\u3053\u3046\u3068\u601d\u3044\u307e\u3059\u3002
\n<\/p>\n
\u6307\u5b9a\u3057\u305fURL\u306b\u5bfe\u3057\u3066\u30ea\u30af\u30a8\u30b9\u30c8\u3092\u6295\u3052\u3066\u3001\u30ec\u30b9\u30dd\u30f3\u30b9\u3092\u53d6\u5f97\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u30b7\u30f3\u30d7\u30eb\u306a\u30e9\u30a4\u30d6\u30e9\u30ea\u3067\u3059\u3002
\nJavaScript\u5b9f\u884c\u3092\u5fc5\u8981\u3068\u3057\u306a\u3044\u3088\u3046\u306a\u9759\u7684\u306a\u30b5\u30a4\u30c8\u304b\u3089Response\u3092\u53d6\u5f97\u3059\u308b\u76ee\u7684\u3067\u3042\u308c\u3070\u3053\u308c\u3067\u5341\u5206\u3067\u3059\u3002<\/p>\n
urllib3.util.retry.Retry<\/code>\u3068\u4e00\u7dd2\u306b\u4f7f\u3048\u3070\u30ea\u30c8\u30e9\u30a4\u3082\u3067\u304d\u308b<\/li>\nresponse.text<\/code>\u3067\u672c\u6587\u3092\u53d6\u5f97\u3059\u308b\u969b\u3001charset\u3092\u3088\u3057\u306a\u306b\u89e3\u91c8\u3057\u3066decode\u3057\u3066\u6587\u5b57\u5217\u306b\u3057\u3066\u304f\u308c\u308b<\/li>\n- \u3067\u304d\u306a\u3044\u3053\u3068\n
\n- \u30ec\u30b9\u30dd\u30f3\u30b9\u3092\u89e3\u6790\u3057\u3066\u3001\u7279\u5b9a\u306eHtmlElement\u3092\u53d6\u5f97\u3059\u308b\u3053\u3068\u306f\u3067\u304d\u306a\u3044 => beautifulsoup4\u3084lxml\u3092\u4f7f\u3046<\/li>\n
- \u30ec\u30b9\u30dd\u30f3\u30b9\u3092\u53d7\u3051\u53d6\u3063\u3066\u3001Javascript\u304c\u5b9f\u884c\u3055\u308c\u305f\u5f8c\u306eHTML\u3092\u53d6\u5f97\u3059\u308b\u3053\u3068\u306f\u3067\u304d\u306a\u3044 => requests-html\u3084pyppeteer\u3092\u4f7f\u3046<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n
\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb<\/h3>\npip install requests\n# or\npoetry add requests<\/code><\/pre>\n\u5b9f\u88c5\u30b5\u30f3\u30d7\u30eb<\/h3>\n\u5358\u7d14\u306b\u30ea\u30af\u30a8\u30b9\u30c8<\/h4>\nimport requests\nfrom requests import Response\n\nresponse: Response = requests.get('http:\/\/quotes.toscrape.com\/')\nresponse.status_code\n# -> 200\nresponse.headers\n# -> {'Server': 'nginx\/1.14.0 (Ubuntu)', 'Date': 'Tue, 11 Aug 2020 13:00:00 GMT', 'Content-Type': 'text\/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Upstream': 'spidyquotes-master_web', 'Content-Encoding': 'gzip'}\nresponse.text\n# -> <!DOCTYPE html><html lang="en"><head>...<\/head><body><\/body><\/html><\/code><\/pre>\n\u30ea\u30c8\u30e9\u30a4\u3042\u308a\u3067\u30ea\u30af\u30a8\u30b9\u30c8<\/h4>\nimport requests\nfrom requests import Response, Session\nfrom requests.exceptions import RequestException, Timeout\nfrom requests.adapters import HTTPAdapter\nfrom urllib3.util.retry import Retry\nfrom typing import Dict\n\nwith Session() as session:\n url: str = 'http:\/\/quotes.toscrape.com\/'\n headers: Dict[str, str] = {'User-Agent': 'Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/69.0.3497.100 Safari\/537.36', 'Accept': '*\/*'} # noqa\n retries: Retry = Retry(total=5, # \u30ea\u30c8\u30e9\u30a4\u56de\u6570\n backoff_factor=3, # \u30ea\u30c8\u30e9\u30a4\u9593\u9694\u3002\u4f8b\u3048\u30702\u3092\u6307\u5b9a\u3059\u308b\u3068 2\u79d2 => 4\u79d2 => 8\u79d2 => 16\u79d2\u306e\u3088\u3046\u306b\u306a\u308b\n status_forcelist=[500, 501, 502, 503, 504, 505, 506, 507, 508, 510, 511], # \u30ea\u30c8\u30e9\u30a4\u5bfe\u8c61\u306e\u30b9\u30c6\u30fc\u30bf\u30b9\u30b3\u30fc\u30c9\n raise_on_status=False # 
`status_forcelist`\u306e\u30b9\u30c6\u30fc\u30bf\u30b9\u30b3\u30fc\u30c9\u3067\u30ea\u30c8\u30e9\u30a4\u7d42\u4e86\u3057\u305f\u5834\u5408\u306b\u30a8\u30e9\u30fcraise\u3059\u308b\u304b\u3069\u3046\u304b\u3002False\u3060\u3068Response\u3092\u8fd4\u3059\n )\n\n session.mount(url[0:url.find('\/\/') + 2], HTTPAdapter(max_retries=retries))\n\n try:\n response: Response = session.get(url, headers=headers)\n response.status_code\n # -> 200\n response.headers\n # -> {'Server': 'nginx\/1.14.0 (Ubuntu)', 'Date': 'Tue, 11 Aug 2020 13:00:00 GMT', 'Content-Type': 'text\/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Upstream': 'spidyquotes-master_web', 'Content-Encoding': 'gzip'}\n response.text # str\u3068\u3057\u3066\u53d6\u5f97\n # -> <!DOCTYPE html><html lang="en"><head>...<\/head><body><\/body><\/html>\n response.content # bytes\u3068\u3057\u3066\u53d6\u5f97\n # -> b'<!DOCTYPE html><html lang="en"><head>...<\/head><body><\/body><\/html>'\n\n except (RequestException, ConnectionError, Timeout) as e:\n print(f'possible error occurred. {e}')<\/code><\/pre>\n\n- \u30ea\u30c8\u30e9\u30a4\u8a2d\u5b9a\u306e\u8a73\u7d30\u306f\u3053\u3061\u3089<\/a>\u3092\u53c2\u7167<\/li>\n
- \u30b3\u30cd\u30af\u30b7\u30e7\u30f3\u30a8\u30e9\u30fc\u306a\u3069\u30cd\u30c3\u30c8\u30ef\u30fc\u30af\u95a2\u4fc2\u306e\u30a8\u30e9\u30fc\u306f\u30c7\u30d5\u30a9\u30eb\u30c8\u3067\u30ea\u30c8\u30e9\u30a4\u3055\u308c\u308b\u3002\u4eca\u56de\u306f\u305d\u308c\u4ee5\u5916\u306b
status_forcelist<\/code>\u3067500\u7cfb\u306e\u30b9\u30c6\u30fc\u30bf\u30b9\u30b3\u30fc\u30c9\u304c\u8fd4\u3063\u305f\u3089\u30ea\u30c8\u30e9\u30a4\u3059\u308b\u8a2d\u5b9a\u306b\u3057\u3066\u3042\u308b<\/li>\nraise_on_status<\/code>\u306bTrue\u3092\u8a2d\u5b9a\u3059\u308b\u3068status_forcelist<\/code>\u3067\u8a2d\u5b9a\u3057\u305f\u30b9\u30c6\u30fc\u30bf\u30b9\u306e\u5834\u5408\u3067\u3082\u30a8\u30e9\u30fc\u3092raise\u3059\u308b\u3002False\u306b\u3059\u308b\u3068Response\u304c\u8fd4\u308a\u3001status_code\u306b500\u7cfb\u304c\u8a2d\u5b9a\u3055\u308c\u308b<\/li>\n- Connection\u30a8\u30e9\u30fc\u306a\u3069\u306e\u60f3\u5b9a\u3055\u308c\u308b\u30a8\u30e9\u30fc\u306f\u3001
except (RequestException, ConnectionError, Timeout)<\/code>\u3067\u307e\u3068\u3081\u3066\u30ad\u30e3\u30c3\u30c1\u3057\u3066\u63e1\u308a\u3064\u3076\u3057\u3001\u305d\u308c\u4ee5\u5916\u306eRuntimeError\u306a\u3069\u306f\u305d\u306e\u307e\u307eraise\u3057\u3066\u3044\u308b<\/li>\n<\/ul>\nBeautiful Soup<\/a><\/h2>\nBeautiful Soup\u306fHTML\u3084XML\u3092\u89e3\u6790\u3057\u3066\u30c7\u30fc\u30bf\u3092\u62bd\u51fa\u3059\u308b\u30e9\u30a4\u30d6\u30e9\u30ea\u3067\u3059\u3002requests\u306a\u3069\u3067\u76ee\u7684\u306e\u30da\u30fc\u30b8\u306eHTML\u3092\u53d6\u5f97\u3057\u3066\u3001\u305d\u306eHTML\u3092\u89e3\u6790\u3059\u308b\u969b\u306b\u5229\u7528\u3059\u308b\u3053\u3068\u304c\u591a\u3044\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n
\u7279\u5fb4<\/h3>\n\n- HTML\u6587\u5b57\u5217\u3092\u89e3\u6790\u3057\u3066\u30a8\u30ec\u30e1\u30f3\u30c8\u3092\u53d6\u5f97\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u3002\u5927\u304d\u304f\u4e09\u3064\u306e\u65b9\u6cd5\u304c\u3042\u308b\n
\n- \u30bf\u30b0\u3092\u305f\u3069\u3063\u3066\u3044\u304f\u65b9\u6cd5<\/li>\n
- find\u3001find_all\u306a\u3069\u3067\u5bfe\u8c61\u306e\u30bf\u30b0\u3092\u63a2\u3059\u65b9\u6cd5<\/li>\n
- CSS\u30bb\u30ec\u30af\u30bf\u30fc\u3092\u4f7f\u3063\u3066\u5bfe\u8c61\u306e\u30bf\u30b0\u3092\u53d6\u5f97\u3059\u308b\u65b9\u6cd5<\/li>\n<\/ul>\n<\/li>\n
- \u5bfe\u8c61\u306e\u30a8\u30ec\u30e1\u30f3\u30c8\u306e\u5c5e\u6027\u3092\u53d6\u5f97\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b<\/li>\n
- HTML\u30d1\u30fc\u30b5\u30fc\u306f\u3001\u30c7\u30d5\u30a9\u30eb\u30c8\u3060\u3068Python\u6a19\u6e96\u30e9\u30a4\u30d6\u30e9\u30ea\u306e\u30d1\u30fc\u30b5\u30fc\u3092\u4f7f\u3046\u304c\u3001HTML\u30d1\u30fc\u30b5\u30fc\u3092\u5909\u66f4\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u308b\n
\nlxml<\/code>\u306e\u65b9\u304c\u7206\u901f\u3089\u3057\u3044\u306e\u3067\u3001\u5927\u91cf\u306e\u30c7\u30fc\u30bf\u3092\u30d1\u30fc\u30b9\u3059\u308b\u5fc5\u8981\u304c\u3042\u308b\u5834\u5408\u306flxml<\/code>\u3092\u4f7f\u3046\u65b9\u304c\u826f\u3055\u305d\u3046<\/li>\nsoup: BeautifulSoup = BeautifulSoup(response.text, 'lxml')<\/code> \u306e\u3088\u3046\u306b\u6307\u5b9a\u3059\u308b\u3060\u3051<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb<\/h3>\npip install beautifulsoup4\npip install lxml # HTML\u30d1\u30fc\u30b5\u30fc\u3092lxml\u306b\u5909\u66f4\u3059\u308b\u5834\u5408<\/code><\/pre>\n\u5b9f\u88c5\u30b5\u30f3\u30d7\u30eb<\/h3>\n
\u30bd\u30fc\u30b9\u30b3\u30fc\u30c9\u5185\u306e\u30b3\u30e1\u30f3\u30c8\u3092\u898b\u308c\u3070\u3060\u3044\u305f\u3044\u308f\u304b\u308b\u3068\u601d\u3044\u307e\u3059\u306e\u3067\u3001\u89e3\u8aac\u306f\u7701\u304d\u307e\u3059\u304c\u3001\u3053\u308c\u3050\u3089\u3044\u77e5\u3063\u3066\u304a\u3051\u3070\u5341\u5206\u306a\u611f\u3058\u304c\u3057\u3066\u307e\u3059\u3002<\/p>\n
import requests\nfrom requests import Response\nfrom bs4 import BeautifulSoup\n\ndef main():\n response: Response = requests.get('http:\/\/quotes.toscrape.com\/')\n soup: BeautifulSoup = BeautifulSoup(response.text)\n\n # == \u30bf\u30b0\u3092\u305f\u3069\u3063\u3066\u30a8\u30ec\u30e1\u30f3\u30c8\u3092\u53d6\u5f97\u3059\u308b ==\n soup.title\n # -> <title>Quotes to Scrape<\/title>\n soup.title.parent\n # -> <head><meta charset="utf-8"\/><title>Quotes to Scrape<\/title><link href="\/static\/bootstrap.min.css" rel="stylesheet"\/><link href="\/static\/main.css" rel="stylesheet"\/><\/head> # noqa\n soup.body.footer.div.p.a\n # -> <a href="https:\/\/www.goodreads.com\/quotes">GoodReads.com<\/a>\n\n # == find, find_all\u3067\u30bf\u30b0\u3092\u63a2\u3057\u3066\u30a8\u30ec\u30e1\u30f3\u30c8\u3092\u53d6\u5f97\u3059\u308b ==\n soup.find("title")\n # -> <title>Quotes to Scrape<\/title>\n soup.find_all("a")\n # -> [<a href="\/" style="text-decoration: none">Quotes to Scrape<\/a>, <a href="\/login">Login<\/a>, <a href="\/author\/Albert-Einstein">(about)<\/a>, ... , <a href="https:\/\/scrapinghub.com">Scrapinghub<\/a>] # noqa\n\n # == CSS\u30bb\u30ec\u30af\u30bf\u3067\u30a8\u30ec\u30e1\u30f3\u30c8\u3092\u53d6\u5f97\u3059\u308b ==\n soup.select("body .container .row .quote small.author")\n # -> [<small class="author" itemprop="author">Albert Einstein<\/small>, <small class="author" itemprop="author">J.K. Rowling<\/small>, ... 
, <small class="author" itemprop="author">Steve Martin<\/small>] # noqa\n soup.select("body .container .row .quote:first-child small.author")\n # -> [<small class="author" itemprop="author">Albert Einstein<\/small>]\n soup.select("body .container .row div.quote:last-of-type small.author")\n # -> [<small class="author" itemprop="author">Steve Martin<\/small>]\n\n # == \u30a8\u30ec\u30e1\u30f3\u30c8\u306e\u5c5e\u6027\u60c5\u5831\u3092\u53d6\u5f97\u3059\u308b ==\n soup.title.string\n # -> Quotes to Scrape\n soup.title.name\n # -> title\n soup.body.footer.div.p.a["href"]\n # -> https:\/\/www.goodreads.com\/quotes\n\n # == \u6b63\u898f\u8868\u73fe\u3067\u30c6\u30ad\u30b9\u30c8\u3092\u62bd\u51fa\u3059\u308b ==\n import re\n soup.find_all(text=re.compile("^\u201cA "))\n # -> ["\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d", '\u201cA day without sunshine is like, you know, night.\u201d'] # noqa\n\nif __name__ == "__main__":\n main()<\/code><\/pre>\npyppeteer<\/a><\/h2>\npyppeteer\u306f\u3001npm\u30e2\u30b8\u30e5\u30fc\u30eb\u3067\u3042\u308bpuppeteer<\/a>\u3092python\u306b\u79fb\u690d\u3057\u305f\u3082\u306e\u3067\u3059\u3002<\/p>\n\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\uff08chromium\uff09\u3092\u958b\u3044\u3066\u3001\u5b9f\u969b\u306b\u30d6\u30e9\u30a6\u30b6\u5185\u3067\u30da\u30fc\u30b8\u3092\u8aad\u307f\u8fbc\u3080\u306e\u3067JavaScript\u304c\u5b9f\u884c\u3055\u308c\u307e\u3059\u3002\u307e\u305f\u3001CSS\u30bb\u30ec\u30af\u30bf\u3067\u30a8\u30ec\u30e1\u30f3\u30c8\u3092\u63a2\u3057\u3066\u30af\u30ea\u30c3\u30af\u3057\u305f\u308a\u3001\u753b\u9762\u9077\u79fb\u3092\u5f85\u3063\u305f\u308a\u3001\u6307\u5b9a\u3057\u305fJavaScript\u30b3\u30fc\u30c9\u3092\u5b9f\u884c\u3057\u305f\u308a\u3059\u308b\u3053\u3068\u304c\u51fa\u6765\u307e\u3059\u3002<\/p>\n
JavaScript\u304c\u5fc5\u8981\u3067\u3001\u30a8\u30ec\u30e1\u30f3\u30c8\u30af\u30ea\u30c3\u30af\u306b\u3088\u308b\u753b\u9762\u9077\u79fb\u3092\u3057\u305f\u3044\u30b1\u30fc\u30b9\u3067\u306f\u3053\u3061\u3089\u3092\u5229\u7528\u3059\u308b\u306e\u304c\u826f\u3044\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n
\u3061\u306a\u307f\u306b\u3001\u30aa\u30ea\u30b8\u30ca\u30eb\u306egithub\u30ea\u30dd\u30b8\u30c8\u30ea<\/a>\u306f\u73fe\u5728Archived\u306b\u306a\u3063\u3066\u304a\u308a\u3001\u3053\u3061\u3089\u306e\u30ea\u30dd\u30b8\u30c8\u30ea<\/a>\u3067\u958b\u767a\u304c\u7d99\u7d9a\u3055\u308c\u3066\u3044\u308b\u3088\u3046\u3067\u3059\u3002
\n\u307b\u307c\u540c\u7b49\u306e\u3053\u3068\u304c\u3067\u304d\u308bSelenium\u3082\u3042\u308b\u306e\u3067\u3059\u304c\u3001JavaScript\u4e16\u754c\u3067\u306fpuppeteer\u304c\u5727\u5012\u7684\u306a\u4eba\u6c17\u306a\u306e\u3067\u3001\u3042\u307e\u308a\u6df1\u304f\u8003\u3048\u305apython\u3067\u3082pyppeteer\u3092\u63a1\u7528\u3057\u307e\u3057\u305f\u3002 <\/p>\n\u7279\u5fb4<\/h3>\n\n- \u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u4e0a\u3067\u5b9f\u969b\u306b\u753b\u9762\u3092\u8aad\u307f\u8fbc\u3093\u3067\u3001\u753b\u9762\u4e0a\u3067\u30dc\u30bf\u30f3\u30af\u30ea\u30c3\u30af=>\u753b\u9762\u9077\u79fb\u3092\u7e70\u308a\u8fd4\u3059\u3053\u3068\u3082\u3067\u304d\u308b<\/li>\n
- \u753b\u9762\u9077\u79fb\u306e\u5b8c\u4e86\u3092\u3001
\u4e00\u5b9a\u6642\u9593\u5f85\u3064<\/code> or \u7279\u5b9a\u306e\u8981\u7d20\u306e\u51fa\u73fe\u3092\u5f85\u3064<\/code> or \u6307\u5b9a\u3057\u305fJavaScript\u95a2\u6570\u304cTrue\u3092\u8fd4\u3059\u307e\u3067\u5f85\u3064<\/code> \u306e\u3088\u3046\u306b\u8907\u6570\u306e\u65b9\u6cd5\u3067\u5f85\u3061\u5408\u308f\u305b\u308b\u3053\u3068\u304c\u3067\u304d\u308b<\/li>\n- \u30bb\u30ec\u30af\u30bf\u3092\u4f7f\u3063\u3066\u7279\u5b9a\u306eHTML\u30a8\u30ec\u30e1\u30f3\u30c8\u3092\u53d6\u5f97\u3059\u308b\u3053\u3068\u3082\u3067\u304d\u308b<\/li>\n
- HTTP\u30ec\u30b9\u30dd\u30f3\u30b9\uff08\u30d8\u30c3\u30c0\u30fc\u3001\u30b9\u30c6\u30fc\u30bf\u30b9\u30b3\u30fc\u30c9\u306a\u3069\uff09\u3092\u53d6\u5f97\u3067\u304d\u308b<\/li>\n
- \u30de\u30eb\u30c1\u30d7\u30ed\u30bb\u30b9\u3067\u5b9f\u884c\u3082\u3067\u304d\u308b\u3057\u3001Linux\u4e0a\u3067\u3082\u52d5\u304b\u305b\u308b<\/li>\n
- \u30de\u30a4\u30ca\u30b9\u8981\u7d20\n
\n- chromium\u3092\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3057\u3066\u5b9f\u884c\u3059\u308b\u306e\u3067\u3001Linux\u4e0a\u3067\u52d5\u304b\u3059\u6642\u306b\u591a\u5c11\u30cf\u30de\u308a\u30dd\u30a4\u30f3\u30c8\u304c\u3042\u308b\uff08\u89e3\u6c7a\u53ef\u80fd\uff09<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n
\u8a73\u7d30<\/h3>\n
\u9577\u304f\u306a\u3063\u305f\u306e\u3067\u3001\u5225\u3067\u8a18\u4e8b\u3092\u8d77\u3053\u3057\u307e\u3057\u305f\u3002\u8208\u5473\u304c\u3042\u308c\u3070\u3053\u3061\u3089\u3092\u53c2\u7167\u304f\u3060\u3055\u3044\u3002
\npyppeteer\u306e\u4f7f\u3044\u65b9<\/a><\/p>\nrequests-html<\/a><\/h2>\nrequests-html\u306f\u3001requests\u30fbPyppeteer\u30fbPyQuery\u30fbBeautifulSoup\u3092\u30e9\u30c3\u30d7\u3057\u3066\u4e00\u3064\u306eAPI\u3068\u3057\u3066\u63d0\u4f9b\u3057\u3066\u304f\u308c\u3066\u3044\u308b\u30e9\u30a4\u30d6\u30e9\u30ea\u3067\u3059\u3002<\/p>\n
\u306a\u3093\u3067\u3082\u3067\u304d\u308b\u3088\u3046\u3067\u3059\u304c\u3001\u7279\u5fb4\u306e\u30de\u30a4\u30ca\u30b9\u8981\u7d20\u306e\u6240\u306b\u66f8\u3044\u3066\u3044\u308b\u3088\u3046\u306b\u3001\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3092\u4f7f\u3063\u3066\u304c\u3063\u3064\u308a\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u3059\u308b\u3088\u3046\u306a\u30b1\u30fc\u30b9\u3067\u306f\u6b63\u76f4\u3082\u306e\u305f\u308a\u306a\u3044\u3067\u3059\u3002\u305d\u306e\u7528\u9014\u306a\u3089pyppeteer\u306e\u65b9\u304c\u3044\u3044\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n
requests+Beautiful Soup\u306e\u4ee3\u66ff\u3068\u3057\u3066\u306f\u30a2\u30ea\u3060\u3051\u3069\u3001\u305d\u306e\u7528\u9014\u306a\u3089\u5143\u3005\u96e3\u3057\u3044\u3068\u3053\u308d\u306f\u306a\u3044\u3057\u3001\u3042\u3048\u3066\u4e57\u308a\u63db\u3048\u308b\u5fc5\u8981\u306f\u306a\u3044\u304b\u306a\u3041\u3068\u3044\u3046\u5370\u8c61\u3067\u3057\u305f\u3002<\/p>\n
\u7279\u5fb4<\/h3>\n\n- \u30d5\u30ebJavaScript\u30b5\u30dd\u30fc\u30c8\n
\n- requests\u3067\u53d6\u5f97\u3057\u305fResponse\u3092\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3067\u63cf\u753b\u3057\u3066\u3001JavaScript\u3092\u5b9f\u884c\u3059\u308b<\/li>\n<\/ul>\n<\/li>\n
- CSS Selectors\u3092\u4f7f\u3063\u305f\u30a8\u30ec\u30e1\u30f3\u30c8\u9078\u629e<\/li>\n
- \u30b3\u30cd\u30af\u30b7\u30e7\u30f3\u30d7\u30fc\u30eb\u304a\u3088\u3073\u6c38\u7d9a\u7684\u306acookie\u30b5\u30dd\u30fc\u30c8<\/li>\n
- \u30de\u30a4\u30ca\u30b9\u8981\u7d20\n
\n- Response\u3092\u53d6\u5f97\u3059\u308b\u969b\u306b\u30ea\u30c8\u30e9\u30a4\u6a5f\u80fd\u304c\u306a\u3044\uff08\u516c\u5f0f\u306e\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u304b\u3089\u767a\u898b\u3067\u304d\u305a\uff09<\/li>\n
- \u30dc\u30bf\u30f3\u30af\u30ea\u30c3\u30af\u3092\u3057\u305f\u3044\u6642\u3001
element.click()<\/code>\u307f\u305f\u3044\u306a\u7c21\u5358\u306a\u65b9\u6cd5\u306f\u63d0\u4f9b\u3055\u308c\u3066\u304a\u3089\u305a\u3001\u30dc\u30bf\u30f3\u30af\u30ea\u30c3\u30af\u3059\u308bjavascript\u3092render\u3059\u308b\u5fc5\u8981\u304c\u3042\u308b<\/li>\n- \u753b\u9762\u9077\u79fb\u3092\u5f85\u3064\u65b9\u6cd5\u304c\u3001\u6642\u9593\u6307\u5b9a\u3060\u3051\uff08pyppeteer\u306f\u3001selector\u304c\u73fe\u308c\u308b\u307e\u3067\u3068\u304b\u3001JavaScript\u95a2\u6570\u304ctrue\u3092\u8fd4\u3059\u307e\u3067\u3001\u3068\u304b\u3067\u304d\u308b\uff09<\/li>\n
- \u753b\u9762\u9077\u79fb\u3092\u4f34\u3046\u30dc\u30bf\u30f3\u30af\u30ea\u30c3\u30af\u3092\u4e8c\u56de\u5b9f\u884c\u3059\u308b\u65b9\u6cd5\u304c\u306a\u3044\uff08\u81ea\u5206\u306e\u7406\u89e3\u4e0d\u8db3\u304b\u3082\u3057\u308c\u307e\u305b\u3093\uff09<\/li>\n
- \u30d6\u30e9\u30a6\u30b6\u3092\u8868\u793a\u3057\u3066\u5b9f\u884c\u3059\u308b\u65b9\u6cd5\u304c\u306a\u3044\u306e\u3067\u30c7\u30d0\u30c3\u30b0\u3057\u306b\u304f\u3044\uff08\u81ea\u5206\u306e\u7406\u89e3\u4e0d\u8db3\u304b\u3082\u3057\u308c\u307e\u305b\u3093\uff09<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n
\u5b9f\u88c5\u30b5\u30f3\u30d7\u30eb<\/h3>\n\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3092\u4f7f\u308f\u306a\u3044\u30b1\u30fc\u30b9<\/h4>\n
\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3092\u4f7f\u308f\u305a\u4ee5\u4e0b\u306e\u30b1\u30fc\u30b9\u3092\u5b9f\u884c\u3057\u3066\u307f\u307e\u3057\u305f\u3002<\/p>\n
\n- Response\u30aa\u30d6\u30b8\u30a7\u30af\u30c8\u3092\u53d6\u5f97\u3059\u308b<\/li>\n
- CSS\u30bb\u30ec\u30af\u30bf\u30fc\u3092\u4f7f\u3063\u3066\u30a8\u30ec\u30e1\u30f3\u30c8\u62bd\u51fa\u3059\u308b<\/li>\n
- \u30a8\u30ec\u30e1\u30f3\u30c8\u306e\u5c5e\u6027\u60c5\u5831\u3092\u53d6\u5f97\u3059\u308b<\/li>\n
- \u30da\u30fc\u30b8\u5185\u306e\u30ea\u30f3\u30af\u3092\u5168\u3066\u53d6\u5f97\u3059\u308b<\/li>\n<\/ul>\n
from requests_html import HTMLSession\nfrom requests import Response\n\ndef main():\n session: HTMLSession = HTMLSession()\n response: Response = session.get('http:\/\/quotes.toscrape.com\/')\n\n # == Response\u30aa\u30d6\u30b8\u30a7\u30af\u30c8\u3092\u53d6\u5f97\u3059\u308b ==\n response.status_code\n # -> 200\n response.headers\n # -> {'Server': 'nginx\/1.14.0 (Ubuntu)', 'Date': 'Tue, 11 Aug 2020 13:11:10 GMT', 'Content-Type': 'text\/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Upstream': 'spidyquotes-master_web', 'X-Content-Encoding-Over-Network': 'gzip'} # noqa\n response.text\n # -> <!DOCTYPE html><html lang="en"><head>...<\/head><body>...<\/body><\/html>\n\n # == CSS\u30bb\u30ec\u30af\u30bf\u30fc\u3092\u4f7f\u3063\u3066\u30a8\u30ec\u30e1\u30f3\u30c8\u62bd\u51fa\u3059\u308b ==\n # \u5168\u3066\u62bd\u51fa\n response.html.find("body .container .row .quote small.author")\n # -> [<Element 'small' class=('author',) itemprop='author'>, <Element 'small' class=('author',) itemprop='author'>, <Element 'small' class=('author',) itemprop='author'>, ... 
, <Element 'small' class=('author',) itemprop='author'>] # noqa\n response.html.find("body .container .row div.quote:first-child small.author")\n # -> [<Element 'small' class=('author',) itemprop='author'>]\n response.html.find("body .container .row div.quote:last-of-type small.author")\n # -> [<Element 'small' class=('author',) itemprop='author'>]\n\n # \u6700\u521d\u306e1\u4ef6\u62bd\u51fa\n response.html.find("body .container .row .quote small.author", first=True)\n # -> <Element 'small' class=('author',) itemprop='author'>\n\n # == \u30a8\u30ec\u30e1\u30f3\u30c8\u306e\u5c5e\u6027\u60c5\u5831\u3092\u53d6\u5f97\u3059\u308b ==\n response.html.find("body .container .row .quote a", first=True).attrs['href']\n # -> \/author\/Albert-Einstein\n response.html.find("body .container .row .quote small.author", first=True).text\n # -> Albert Einstein\n\n # == \u30da\u30fc\u30b8\u5185\u306e\u30ea\u30f3\u30af\u3092\u5168\u3066\u53d6\u5f97\u3059\u308b ==\n response.html.absolute_links\n # -> {'http:\/\/quotes.toscrape.com\/author\/Jane-Austen', 'http:\/\/quotes.toscrape.com\/author\/Steve-Martin', 'http:\/\/quotes.toscrape.com\/tag\/obvious\/page\/1\/', ... , 'http:\/\/quotes.toscrape.com\/tag\/friendship\/'} # noqa\n response.html.links\n # -> {'\/tag\/live\/page\/1\/', '\/author\/Jane-Austen', '\/tag\/aliteracy\/page\/1\/', '\/page\/2\/', '\/tag\/choices\/page\/1\/', '\/tag\/reading\/', '\/', ... 
, '\/author\/Thomas-A-Edison'} # noqa\n\nif __name__ == "__main__":\n main()<\/code><\/pre>\n\u7279\u306b\u96e3\u3057\u3044\u70b9\u306f\u3042\u308a\u307e\u305b\u3093\u3067\u3057\u305f\u304c\u3001session.get()<\/code>\u3059\u308b\u969b\u306bretry\u3059\u308b\u6a5f\u80fd\u304c\u63d0\u4f9b\u3055\u308c\u3066\u306a\u3044\u306e\u304c\u5c11\u3057\u6b8b\u5ff5\u3067\u3001Retry\u3057\u305f\u304b\u3063\u305f\u3089\u81ea\u524d\u3067\u5bfe\u5fdc\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u305d\u3046\u3067\u3059\u3002<\/p>\n\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3092\u4f7f\u3046\u30b1\u30fc\u30b9<\/h4>\n
\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3092\u4f7f\u3063\u3066\u3001\u4ee5\u4e0b\u3092\u8a66\u3057\u307e\u3057\u305f\u3002<\/p>\n
\n- Response HTML\u3092\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3067\u8aad\u307f\u8fbc\u3080<\/li>\n
- CSS\u30bb\u30ec\u30af\u30bf\u3067\u8981\u7d20\u3092\u62bd\u51fa\u3059\u308b<\/li>\n
- JavaScript\u3092\u5b9f\u884c\u3057\u3066\u3001\u30dc\u30bf\u30f3\u3092\u30af\u30ea\u30c3\u30af\u3057\u3066\u753b\u9762\u9077\u79fb\u3059\u308b<\/li>\n
- \u753b\u9762\u9077\u79fb\u5f8c\u306eHTML\u304b\u3089\u8981\u7d20\u3092\u62bd\u51fa\u3059\u308b<\/li>\n<\/ul>\n
from requests_html import HTMLSession\nfrom requests import Response\n\ndef main():\n session: HTMLSession = HTMLSession()\n response: Response = session.get('https:\/\/qiita.com\/')\n\n # == \u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3067\u8aad\u307f\u8fbc\u307f ==\n # \u30ec\u30b9\u30dd\u30f3\u30b9HTML\u3092\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3067\u8aad\u307f\u8fbc\u307f5\u79d2\u5f85\u3064\n response.html.render(sleep=5)\n\n # == \u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3067\u63cf\u753b\u3055\u308c\u305fHTML\u3092\u53d6\u5f97 ==\n response.html.raw_html\n # -> b'<!DOCTYPE html><html><head><meta charset="utf-8"><title>Qiita<\/title>...<\/iframe><\/div><\/div><\/body><\/html>'\n\n # == \u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3067\u63cf\u753b\u3055\u308c\u305fHTML\u304b\u3089CSS\u30bb\u30ec\u30af\u30bf\u3067\u8981\u7d20\u3092\u691c\u7d22 ==\n response.html.find('.p-home_aside div[data-hyperapp-app="TagRanking"] .ra-TagList_content .ra-Tag_name a', first=True)\n # -> <Element 'a' href='\/tags\/python'>\n\n # == \u30b9\u30af\u30ea\u30d7\u30c8\u3092\u5b9f\u884c ==\n # \u30e6\u30fc\u30b6\u30e9\u30f3\u30ad\u30f3\u30b0\u9031\u9593\u4e00\u4f4d\u3092\u30af\u30ea\u30c3\u30af\u3057\u3066\u753b\u9762\u9077\u79fb\n response.html.render(script="""\n () => { document.querySelector('div[data-hyperapp-app="UserRanking"] .ra-UserList_content .ra-User_name > a').click() }\n """, sleep=5)\n\n # \u753b\u9762\u9077\u79fb\u5f8c\u306eHTML\u304b\u3089\u8981\u7d20\uff08\u30b3\u30f3\u30c8\u30ea\u30d3\u30e5\u30fc\u30b7\u30e7\u30f3\u306e\u4ef6\u6570\uff09\u3092\u62bd\u51fa\n response.html.find('a[href$="contributions"] > p[class^="UserCounterList__UserCounterItemCount"]', first=True).text\n # -> 60598\n\nif __name__ == "__main__":\n main()<\/code><\/pre>\n\u30cf\u30de\u3063\u305f\u30dd\u30a4\u30f3\u30c8\u3082\u5171\u6709\u3057\u3066\u304a\u304d\u307e\u3059\u3002<\/p>\n
\u307e\u305a\u3001response.html.render()<\/code>\u306e\u969b\u306bsleep<\/code>\u3092\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002
\n\u3053\u308c\u3092\u8a2d\u5b9a\u3057\u306a\u3044\u3068pyppeteer.errors.NetworkError: Protocol error (Runtime.callFunctionOn): Cannot find context with specified id<\/code>\u304c\u767a\u751f\u3057\u307e\u3059\u3002<\/p>\n\u6b21\u306b\u3001\u3053\u3061\u3089\u304c\u81f4\u547d\u7684\u306a\u306e\u3067\u3059\u304c\u3001\u4e00\u56deJavaScript\u3067\u30dc\u30bf\u30f3\u30af\u30ea\u30c3\u30af\u3057\u3066\u753b\u9762\u9077\u79fb\u3057\u305f\u5f8c\u306b\u3082\u3046\u4e00\u5ea6JavaScript\u3067\u753b\u9762\u9077\u79fb\u3057\u3088\u3046\u3068\u601d\u3044\u307e\u3057\u305f\u304c\u3001\u3046\u307e\u304f\u52d5\u304d\u307e\u305b\u3093\u3067\u3057\u305f\u3002<\/p>\n
response.html.render(script="""\n () => { document.querySelector('a[href$="contributions"]').click() }\n """, sleep=5)<\/code><\/pre>\n\u3053\u3061\u3089\u306e\u51e6\u7406\u3092\u4e0a\u8a18\u30b5\u30f3\u30d7\u30eb\u306e\u6700\u5f8c\u306b\u8ffd\u52a0\u3057\u305f\u5834\u5408\u3001\u672c\u6765\u30b3\u30f3\u30c8\u30ea\u30d3\u30e5\u30fc\u30b7\u30e7\u30f3\u4e00\u89a7\u304c\u8868\u793a\u3055\u308c\u308b\u3053\u3068\u304c\u671f\u5f85\u3055\u308c\u308b\u306e\u3067\u3059\u304c\u3001pyppeteer.errors.ElementHandleError: Evaluation failed: TypeError: Cannot read property 'click' of null<\/code>\u3068\u3044\u3046\u30a8\u30e9\u30fc\u304c\u767a\u751f\u3057\u307e\u3057\u305f\u3002\u3069\u3046\u3084\u3089\u3001\u5bfe\u8c61\u306e\u8981\u7d20\u304c\u5b58\u5728\u3057\u306a\u3044\u3053\u3068\u306b\u306a\u3063\u3066\u3044\u308b\u3088\u3046\u3067\u3059\u3002
\n\u8a66\u3057\u306b\u30da\u30fc\u30b8\u306e\u30bf\u30a4\u30c8\u30eb\u3092JavaScript\u3067\u51fa\u529b\u3057\u3066\u307f\u305f\u3068\u3053\u308d\u3001\u753b\u9762\u9077\u79fb\u524d\u306eHTML\u306e\u5024\u304c\u30ed\u30b0\u306b\u51fa\u529b\u3055\u308c\u307e\u3057\u305f\u3002\u3069\u3046\u3084\u3089\u3001JavaScript\u3092render\u3059\u308b\u306e\u306f\u6700\u521d\u306brender\u3057\u305fHTML\u306b\u5bfe\u3057\u3066\u306e\u307f\u3063\u307d\u3044\u306e\u3067\u3059\u3002\uff08\u30c9\u30ad\u30e5\u30e1\u30f3\u30c8\u3092\u8aad\u3093\u3067\u307f\u305f\u306e\u3067\u3059\u304c\u3053\u306e\u90e8\u5206\u306b\u95a2\u3059\u308b\u8aac\u660e\u306f\u767a\u898b\u3067\u304d\u3066\u307e\u305b\u3093\u306e\u3067\u3001\u5c11\u3057\u602a\u3057\u3044\u3067\u3059\uff09\u3002\u81ea\u5206\u306e\u8abf\u67fb\u3067\u306f\u3001requests_html\u3067\u306f\u30d8\u30c3\u30c9\u30ec\u30b9\u30d6\u30e9\u30a6\u30b6\u3092\u4f7f\u3063\u3066\u3001\u5b9f\u969b\u306e\u753b\u9762\u3092\u63cf\u753b\u3057\u3064\u3064\u4e8c\u56de\u753b\u9762\u9077\u79fb\u3092\u884c\u3046\u65b9\u6cd5\u304c\u306a\u3055\u305d\u3046\u3067\u3059\u3002<\/p>\nscrapy<\/h2>\n
Scrapy<\/a> \u306f\u3001Web\u30b5\u30a4\u30c8\u3092\u30af\u30ed\u30fc\u30eb\u3057\u3001\u30da\u30fc\u30b8\u304b\u3089\u69cb\u9020\u5316\u30c7\u30fc\u30bf\u3092\u62bd\u51fa\u3059\u308b\u305f\u3081\u306b\u4f7f\u7528\u3055\u308c\u308bWeb\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u30d5\u30ec\u30fc\u30e0\u30ef\u30fc\u30af\u3067\u3059\u3002\u9759\u7684\u306a\u30b5\u30a4\u30c8\u306e\u30af\u30ed\u30fc\u30ea\u30f3\u30b0\u3067\u3042\u308c\u3070\u4e00\u901a\u308a\u306a\u3093\u3067\u3082\u3067\u304d\u308b\u3051\u3069\u3001\u5b66\u7fd2\u30b3\u30b9\u30c8\u9ad8\u3081\u3068\u3044\u3046\u5370\u8c61\u3067\u3059\u3002<\/p>\n\u500b\u4eba\u7684\u306b\u306f\u3001LinkExtractor\u3084SitemapSpider\u306e\u3088\u3046\u306b\u76ee\u7684\u306b\u3070\u3063\u3061\u308a\u5408\u81f4\u3059\u308b\u30b1\u30fc\u30b9\u3067\u306fScrapy\u3092\u5229\u7528\u3059\u308b\u3051\u3069\u3001\u305d\u3046\u3058\u3083\u306a\u3051\u308c\u3070\u3001requests+Beautiful Soup\u3084requests-html\u3092\u5229\u7528\u3059\u308b\u3068\u601d\u3044\u307e\u3059\u3002<\/p>\n
\u7279\u5fb4<\/h3>\n\n- \u30ea\u30af\u30a8\u30b9\u30c8\u3092\u9001\u4fe1\u3057\u3066\u3001\u30ec\u30b9\u30dd\u30f3\u30b9\u3092\u89e3\u6790\u30fb\u52a0\u5de5\u3057\u3066\u3001\u30a2\u30a6\u30c8\u30d7\u30c3\u30c8\u3059\u308b\u307e\u3067\u306e\u4e00\u9023\u306e\u51e6\u7406\u3092\u307e\u3068\u3081\u3066\u7d71\u5408\u7684\u306b\u7ba1\u7406\u3067\u304d\u308b<\/li>\n
- Item Pipeline\u3092\u4f7f\u3048\u3070\u3001\u3042\u307e\u308a\u8003\u3048\u306a\u304f\u3066\u3082\u3001WEB\u30ea\u30af\u30a8\u30b9\u30c8\u95a2\u4fc2\u3068\u7d50\u679c\u306e\u89e3\u6790\u30fb\u52a0\u5de5\u95a2\u4fc2\u3092\u5f79\u5272\u5206\u62c5\u3057\u3066\u5b9f\u88c5\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b\uff08\u305f\u3060\u3057\u5b9f\u88c5\u91cf\u306f\u5897\u3048\u308b\u3057\u3001Pipeline\u5b9a\u7fa9\u3082\u5fc5\u8981\u306a\u306e\u3067\u521d\u898b\u3067\u4f55\u3084\u3063\u3066\u308b\u304b\u306f\u308f\u304b\u308a\u306b\u304f\u304f\u306a\u308b\uff09<\/li>\n
- LinkExtractor\uff08\u30ea\u30f3\u30af\u3092\u305f\u3069\u3063\u3066URL\u3092\u5217\u6319\uff09\u3084SitemapSpider\uff08sitemap.xml\u3084robots.txt\u304b\u3089URL\u3092\u5217\u6319\uff09\u306a\u3069\u3001\u826f\u304f\u3042\u308b\u6a5f\u80fd\u3092\u30c7\u30d5\u30a9\u30eb\u30c8\u3067\u63d0\u4f9b\u3057\u3066\u304f\u308c\u3066\u3044\u308b<\/li>\n
- \u30de\u30a4\u30ca\u30b9\u8981\u7d20\n
\n- \u5b66\u7fd2\u30b3\u30b9\u30c8\u306f\u9ad8\u3081\u3002\u30bd\u30fc\u30b9\u3060\u3051\u8ffd\u3063\u3066\u3082\u7406\u89e3\u3067\u304d\u306a\u3044\u3053\u3068\u304c\u591a\u3005\u3042\u308b<\/li>\n
- LinkExtractor\u3084SitemapSpider\u306f\u4fbf\u5229\u3060\u3051\u3069\u3001\u9014\u4e2d\u307e\u3067\u3067\u4e2d\u65ad\u3057\u3066\u518d\u958b\u3059\u308b\u3053\u3068\u306f\u3067\u304d\u306a\u3044\u3002\u6570\u4e07URL\u3042\u308b\u3088\u3046\u306a\u30b5\u30a4\u30c8\u3060\u3068\u4f55\u65e5\u3082\u5b9f\u884c\u306b\u304b\u304b\u308b\u304c\u9014\u4e2d\u3067\u30a8\u30e9\u30fc\u304c\u767a\u751f\u3057\u305f\u3089\u6700\u521d\u304b\u3089\u3084\u308a\u76f4\u3057<\/li>\n
- LinkExtractor\u3084SitemapSpider\u306f\u30b5\u30fc\u30d0\u5185\u306e\u4e26\u5217\u5316\u3082\u305f\u3076\u3093\u3046\u307e\u304f\u52d5\u3044\u3066\u306a\u3044\uff08CONCURRENT_REQUESTS<\/a>\u3042\u305f\u308a\u306e\u8a2d\u5b9a\u3092\u8272\u3005\u5909\u3048\u3066\u3082\u7279\u306b\u65e9\u304f\u306a\u3089\u306a\u3044\uff09<\/li>\n
- \u30de\u30eb\u30c1\u30b5\u30fc\u30d0\u3067\u306e\u4e26\u5217\u5316\u306f\u3067\u304d\u306a\u3044<\/li>\n<\/ul>\n<\/li>\n<\/ul>\n
\u8a73\u7d30<\/h3>\n
\u9577\u304f\u306a\u3063\u305f\u306e\u3067\u3001\u8a18\u4e8b\u3092\u8d77\u3053\u3057\u307e\u3057\u305f\u3002\u8208\u5473\u304c\u3042\u308c\u3070\u53c2\u7167\u304f\u3060\u3055\u3044\u3002
\nScrapy\u306e\u4f7f\u3044\u65b9<\/a><\/p>\n\u53c2\u8003\u30ea\u30dd\u30b8\u30c8\u30ea<\/h2>\n
\u4eca\u56de\u691c\u8a3c\u3057\u305f\u30bd\u30fc\u30b9\u30b3\u30fc\u30c9\u306f\u5168\u3066github\u306b\u3042\u3052\u3066\u3042\u308a\u307e\u3059\u306e\u3067\u3001\u5fc5\u8981\u304c\u3042\u308c\u3070\u3053\u3061\u3089<\/a>\u3092\u53c2\u7167\u304f\u3060\u3055\u3044\u3002<\/p>\n\u7d50\u5c40\u3069\u308c\u3092\u4f7f\u3046\uff1f<\/h2>\n
Python\u3067\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u3059\u308b\u969b\u306e\u30e9\u30a4\u30d6\u30e9\u30ea\u306e\u4f7f\u3044\u5206\u3051\u3067\u3059\u304c\u3001\u500b\u4eba\u7684\u306b\u306f<\/p>\n
\n- JavaScript\u306e\u5b9f\u884c\u304c\u5fc5\u8981 =>
pyppeteer<\/code><\/li>\n- JavaScript\u306e\u5b9f\u884c\u304c\u4e0d\u8981\n
\n- \u30ea\u30f3\u30af\u3092\u8fbf\u3063\u3066or\u30b5\u30a4\u30c8\u30de\u30c3\u30d7\u304b\u3089URL\u62bd\u51fa\uff08\u305f\u3060\u3057\u5bfe\u8c61\u30da\u30fc\u30b8\u6570\u304c1\u4e07\u4ef6\u3050\u3089\u3044\u307e\u3067\uff1f\uff09 =>
Scrapy<\/code><\/li>\n- \u305d\u308c\u4ee5\u5916 =>
requests-html<\/code> or requests+Beautiful Soup<\/code><\/li>\n<\/ul>\n<\/li>\n<\/ul>\n\u3068\u3044\u3046\u611f\u3058\u304c\u3044\u3044\u3093\u3058\u3083\u306a\u3044\u304b\u3068\u601d\u3044\u307e\u3057\u305f\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"
Python\u3067\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u3092\u5b9f\u88c5\u3059\u308b\u6a5f\u4f1a\u304c\u3042\u3063\u305f\u306e\u3067\u3001\u305d\u306e\u4e2d\u3067\u5229\u7528\u3057\u305f\uff08\u3082\u3057\u304f\u306f\u6280\u8853\u691c\u8a3c\u3057\u305f\uff09\u30e9\u30a4\u30d6\u30e9\u30ea\u306b\u3064\u3044\u3066\u3001\u7279\u5fb4\u3084\u3069\u3046\u3044\u3046\u6642\u306b\u5229\u7528\u3059\u308b\u304b\u306b\u3064\u3044\u3066\u500b\u4eba\u7684\u306a\u898b\u89e3\u3092\u66f8\u3044\u3066\u3044\u3053\u3046\u3068\u601d\u3044\u307e\u3059\u3002 requests \u6307\u5b9a\u3057\u305fURL\u306b\u5bfe\u3057\u3066\u30ea\u30af\u30a8\u30b9\u30c8\u3092\u6295\u3052\u3066\u3001\u30ec\u30b9\u30dd\u30f3\u30b9\u3092\u53d6\u5f97\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u30b7\u30f3\u30d7\u30eb\u306a\u30e9\u30a4\u30d6\u30e9\u30ea\u3067\u3059\u3002 JavaScript\u5b9f\u884c\u3092\u5fc5\u8981\u3068\u3057\u306a\u3044\u3088\u3046\u306a\u9759\u7684\u306a\u30b5\u30a4\u30c8\u304b\u3089Response\u3092\u53d6\u5f97\u3059\u308b\u76ee\u7684\u3067\u3042\u308c\u3070\u3053\u308c\u3067\u5341\u5206\u3067\u3059\u3002 \u7279\u5fb4 HTTP\u30ec\u30b9\u30dd\u30f3\u30b9\uff08\u30d8\u30c3\u30c0\u30fc\u3001\u30b9\u30c6\u30fc\u30bf\u30b9\u30b3\u30fc\u30c9\u3001HTML\uff09\u3092\u53d6\u5f97\u3067\u304d\u308b \u30ea\u30af\u30a8\u30b9\u30c8\u30d8\u30c3\u30c0\u30fc\u3084\u30af\u30c3\u30ad\u30fc\u3092\u6307\u5b9a\u3057\u3066\u30ea\u30af\u30a8\u30b9\u30c8\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u308b \u30ea\u30c0\u30a4\u30ec\u30af\u30c8\uff08301\u3084302\u306a\u3069\uff09\u3082\u3057\u3066\u304f\u308c\u308b urllib3.util.retry.Retry\u3068\u4e00\u7dd2\u306b\u4f7f\u3048\u3070\u30ea\u30c8\u30e9\u30a4\u3082\u3067\u304d\u308b response.text\u3067\u672c\u6587\u3092\u53d6\u5f97\u3059\u308b\u969b\u3001charset\u3092\u3088\u3057\u306a\u306b\u89e3\u91c8\u3057\u3066decode\u3057\u3066\u6587\u5b57\u5217 <\/span>Continue 
Reading<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[8,7,9],"tags":[],"_links":{"self":[{"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/posts\/205"}],"collection":[{"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/comments?post=205"}],"version-history":[{"count":1,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/posts\/205\/revisions"}],"predecessor-version":[{"id":207,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/posts\/205\/revisions\/207"}],"wp:attachment":[{"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/media?parent=205"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/categories?post=205"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/localhost:8000\/wp-json\/wp\/v2\/tags?post=205"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}