import os
import re
import requests
import time
from bs4 import BeautifulSoup
# Per-site scraping configuration, keyed by site name.
#
# Keys read by the functions in this file:
#   itemEnable          - the site is crawled only when this flag is set
#   staLstUrl           - page that lists the stations
#   reImgLstUrl         - regex selecting station links (href) on the list page
#   reImgLstId          - re.sub replacement applied to a matching href
#   fmtImgLstUrl        - format string turning that id into the station page URL
#   reStaImgUrl         - regex selecting image tags (src) on a station page
#   reStaImgId          - re.sub replacement applied to a matching src
#   fmtStaImgUrl        - format string turning that id into the image URL
#   reStaImgUrlFilBas   - regex applied to "<station page URL>,<image URL>"
#   regStaImgUrlFilBas  - replacement yielding the local file base name
#   reStaImgUrlFilExt   - regex applied to "<station page URL>,<image URL>"
#   regStaImgUrlFilExt  - replacement yielding the local file extension
# "ocvSitUrl", "imgUrl" and "z" are not read by any function visible here.
#
# All patterns are raw strings so that "\." and "\1" reach the regex engine
# unchanged and do not rely on Python keeping unknown string escapes.
staSitLst = {
    # jreast
    #
    # Station list: https://www.jreast.co.jp/estation/stations/
    #   <A href="http://www.jreast.co.jp/estation/stations/1039.html"
    #      target = "_blank" onclick="openMap('1039');return false;">...</A>
    #   (link text is the station name, e.g. Tokyo)
    #
    # Station page: https://www.jreast.co.jp/estation/stations/1039.html
    #   <img src="img/floormap/1039_1f.png" alt="...">
    #   (alt text describes the map, e.g. Tokyo station 1F floor map)
    #
    # Image: https://www.jreast.co.jp/estation/stations/img/floormap/1039_1f.png
    "jreast": {
        "itemEnable": False,
        "ocvSitUrl": "https://www.jreast.co.jp/",
        "staLstUrl": "https://www.jreast.co.jp/estation/stations",
        "reImgLstUrl": r"^(http://www\.jreast\.co\.jp/estation/stations/[0-9]+\..+)$",
        "reImgLstId": r"\1",
        "fmtImgLstUrl": "{}",
        "reStaImgUrl": r"^(img/floormap/.+\..+)$",
        "reStaImgId": r"\1",
        "fmtStaImgUrl": "https://www.jreast.co.jp/estation/stations/{}",
        "reStaImgUrlFilBas": r"^.*img/floormap/([^.]+)\.[^.]+.*$",
        "reStaImgUrlFilExt": r"^.*img/floormap/[^.]+\.([^.]+)$",
        "regStaImgUrlFilBas": r"\1",
        "regStaImgUrlFilExt": r"\1",
        "imgUrl": "{}/{}",
        "z": "z",
    },
    # keikyu
    #
    # Station list: https://www.keikyu.co.jp/train-info/kakueki/index.html
    #   <area coords="206,48,426,88" href="/train-info/kakueki/KK01.html"
    #         alt="..." data-imagemap-rollover-url=
    #         "/assets/image/train-info/kakueki/index_img_route_KK01.png">
    #   (alt text is the station name, e.g. Shinagawa)
    #
    # Station page: https://www.keikyu.co.jp/train-info/kakueki/KK01.html
    #   <img src="/assets/image/train-info/kakueki/KK01/img_03.png" alt="...">
    #   (alt text describes the image as the station's facility map)
    #
    # Image: https://www.keikyu.co.jp/assets/image/train-info/kakueki/KK01/img_03.png
    "keikyu": {
        "itemEnable": False,
        "ocvSitUrl": "https://www.keikyu.co.jp/",
        "staLstUrl": "https://www.keikyu.co.jp/train-info/kakueki/index.html",
        "reImgLstUrl": r"^(/train-info/kakueki/KK[0-9][0-9]\.htm.*)",
        "reImgLstId": r"\1",
        "fmtImgLstUrl": "https://www.keikyu.co.jp{}",
        "reStaImgUrl": r"^(/assets/image/train-info/kakueki/KK[0-9][0-9]/img_03\..+)$",
        "reStaImgId": r"\1",
        "fmtStaImgUrl": "https://www.keikyu.co.jp{}",
        "reStaImgUrlFilBas": r"^.*/assets/image/train-info/kakueki/(KK[0-9][0-9])/img_03.[^.]+.*$",
        "reStaImgUrlFilExt": r"^.*/assets/image/train-info/kakueki/KK[0-9][0-9]/img_03.([^.]+).*$",
        "regStaImgUrlFilBas": r"\1",
        "regStaImgUrlFilExt": r"\1",
        "imgUrl": "{}/{}",
        "z": "z",
    },
    # tokyometro
    #
    # Station list: https://www.tokyometro.jp/station/index03.html
    #   <a href="./akihabara/index.html">
    #
    # Station page: https://www.tokyometro.jp/station/akihabara/yardmap/index.html#adjacent
    #   <p class="v2_yardmapImg"><img src="../../yardmap_img/_station_%E7%A7%8B%E8%91%89%E5%8E%9F_yardmap_images_yardmap.jpg" alt="" class="v2_js-yardmapImg"></p>
    #   <p class="v2_yardmapImg"><img src="../../yardmap_img/figure_yardmap_ayase.gif" alt="" class="v2_js-yardmapImg"></p>
    #
    # Images:
    #   https://www.tokyometro.jp/station/yardmap_img/_station_%E7%A7%8B%E8%91%89%E5%8E%9F_yardmap_images_yardmap.jpg
    #   https://www.tokyometro.jp/station/yardmap_img/figure_yardmap_ayase.gif
    #
    # Unlike the other sites, the file base name is captured from the station
    # page URL (the path segment after /station/), not from the image URL.
    "tokyometro": {
        "itemEnable": False,
        "ocvSitUrl": "https://www.tokyometro.jp/",
        "staLstUrl": "https://www.tokyometro.jp/station/index03.html",
        "reImgLstUrl": r"^\./(.+)/index\.html$",
        "reImgLstId": r"\1",
        "fmtImgLstUrl": "https://www.tokyometro.jp/station/{}/yardmap/index.html",
        "reStaImgUrl": r"^\.\./\.\./(yardmap_img/.+\..+)$",
        "reStaImgId": r"\1",
        "fmtStaImgUrl": "https://www.tokyometro.jp/station/{}",
        "reStaImgUrlFilBas": r"^.*https://www.tokyometro.jp/station/([^/]+)/yardmap/index.html.*$",
        "reStaImgUrlFilExt": r"^.*/yardmap_img/.+\.([^.]+).*$",
        "regStaImgUrlFilBas": r"\1",
        "regStaImgUrlFilExt": r"\1",
        "imgUrl": "{}/{}",
        "z": "z",
    },
}

# Seconds passed to time.sleep() after each image download.
ocvItvlt = 10
def getStaImg(ocvSit, imgLstUrl, staImgUrl, timeout=30):
    """Download one image and save it as ``data/_img/<ocvSit>/floormap/<base>.<ext>``.

    The file base name and extension are derived with the site's
    ``reStaImgUrlFilBas`` / ``reStaImgUrlFilExt`` patterns. Both patterns are
    applied to the string ``"<imgLstUrl>,<staImgUrl>"``, so a pattern may take
    its capture from either URL.

    Args:
        ocvSit: Key into the module-level ``staSitLst`` configuration.
        imgLstUrl: URL of the page on which the image was found.
        staImgUrl: URL of the image to download.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        None. The image is skipped (with a printed message) when a file-name
        pattern does not match or the server answers with an error status.
        Network exceptions raised by ``requests`` propagate to the caller.
    """
    sitCfg = staSitLst[ocvSit]
    urlPair = imgLstUrl + "," + staImgUrl

    namParts = []
    for reKey, regKey in (
        ("reStaImgUrlFilBas", "regStaImgUrlFilBas"),
        ("reStaImgUrlFilExt", "regStaImgUrlFilExt"),
    ):
        # re.sub() returns its input unchanged when the pattern does not
        # match, which would turn the whole URL pair into a file-name part.
        if re.search(sitCfg[reKey], urlPair) is None:
            print("# skip, " + reKey + " does not match: " + urlPair)
            return
        namParts.append(re.sub(sitCfg[reKey], sitCfg[regKey], urlPair))
    imgFilBas, imgFilExt = namParts

    imgDirPth = "/".join(["data", "_img", ocvSit, "floormap"])
    imgFilPth = "/".join([imgDirPth, imgFilBas + "." + imgFilExt])
    print("# staImgUrl=" + staImgUrl)
    print("# imgFilPth=" + imgFilPth)

    os.makedirs(imgDirPth, exist_ok=True)
    staImg = requests.get(staImgUrl, timeout=timeout)
    if staImg.ok:
        with open(imgFilPth, "wb") as imgFil:
            imgFil.write(staImg.content)
    else:
        # Do not store an HTTP error body under an image file name.
        print("# skip, HTTP " + str(staImg.status_code) + ": " + staImgUrl)

    # Pause after every request, successful or not, before the next download.
    time.sleep(ocvItvlt)
def getImgLst(ocvSit, imgLstUrl, timeout=30):
    """Fetch one station page and download every image on it that matches.

    ``<img>`` tags whose ``src`` matches the site's ``reStaImgUrl`` pattern are
    selected; each ``src`` is rewritten with ``reStaImgId``, expanded through
    ``fmtStaImgUrl`` and handed to ``getStaImg``.

    Args:
        ocvSit: Key into the module-level ``staSitLst`` configuration.
        imgLstUrl: URL of the station page to scan for images.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        None. The page is skipped (with a printed message) when the server
        answers with an error status. Network exceptions raised by
        ``requests`` propagate to the caller.
    """
    sitCfg = staSitLst[ocvSit]
    # Compiled once: used both to select the tags and to rewrite each src.
    reStaImgUrl = re.compile(sitCfg["reStaImgUrl"])
    reStaImgId = sitCfg["reStaImgId"]
    fmtStaImgUrl = sitCfg["fmtStaImgUrl"]

    staImgPag = requests.get(imgLstUrl, timeout=timeout)
    if not staImgPag.ok:
        # An error page would only be parsed for images it does not contain.
        print("# skip, HTTP " + str(staImgPag.status_code) + ": " + imgLstUrl)
        return

    soup = BeautifulSoup(staImgPag.text, "lxml")
    for staImgImgTag in soup.find_all("img", src=reStaImgUrl):
        staImgId = reStaImgUrl.sub(reStaImgId, staImgImgTag["src"])
        staImgUrl = fmtStaImgUrl.format(staImgId)
        print("# staImgId=" + staImgId + ", staImgUrl=" + staImgUrl)
        getStaImg(ocvSit, imgLstUrl, staImgUrl)
def getStaLst(ocvSit, staLstUrl, timeout=30):
    """Fetch a site's station list page and process every station link on it.

    ``<a>`` tags whose ``href`` matches the site's ``reImgLstUrl`` pattern are
    selected; each ``href`` is rewritten with ``reImgLstId``, expanded through
    ``fmtImgLstUrl`` and handed to ``getImgLst``.

    Args:
        ocvSit: Key into the module-level ``staSitLst`` configuration.
        staLstUrl: URL of the page listing the site's stations.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        None. The site is skipped (with a printed message) when the server
        answers with an error status. Network exceptions raised by
        ``requests`` propagate to the caller.
    """
    sitCfg = staSitLst[ocvSit]
    # Compiled once: used both to select the tags and to rewrite each href.
    reImgLstUrl = re.compile(sitCfg["reImgLstUrl"])
    reImgLstId = sitCfg["reImgLstId"]
    fmtImgLstUrl = sitCfg["fmtImgLstUrl"]

    staLstPag = requests.get(staLstUrl, timeout=timeout)
    if not staLstPag.ok:
        # An error page would only be parsed for links it does not contain.
        print("# skip, HTTP " + str(staLstPag.status_code) + ": " + staLstUrl)
        return

    soup = BeautifulSoup(staLstPag.text, "lxml")
    # NOTE(review): only <a> tags are searched. The keikyu sample in the
    # configuration comments shows the station links as <area href=...>,
    # which this lookup would not match -- confirm against the live page.
    for staPagATag in soup.find_all("a", href=reImgLstUrl):
        staPagId = reImgLstUrl.sub(reImgLstId, staPagATag["href"])
        imgLstUrl = fmtImgLstUrl.format(staPagId)
        print("# staPagId=" + staPagId + ", imgLstUrl=" + imgLstUrl)
        getImgLst(ocvSit, imgLstUrl)
def getSitLst(staSitLst):
    """Crawl every enabled site in the given configuration mapping.

    Args:
        staSitLst: Mapping of site name to a configuration dict that provides
            at least ``"itemEnable"`` and ``"staLstUrl"``. Entries whose
            ``"itemEnable"`` is falsy are skipped.

    Returns:
        None.

    Note:
        This parameter shadows the module-level ``staSitLst``. ``getStaLst``
        looks the site name up in the module-level mapping, so the names
        passed here must exist there as well.
    """
    for staSitNam, sitCfg in staSitLst.items():
        if not sitCfg["itemEnable"]:
            continue
        staLstUrl = sitCfg["staLstUrl"]
        print("# staSitNam=" + staSitNam + ", staLstUrl=" + staLstUrl)
        getStaLst(staSitNam, staLstUrl)
def getSit():
    """Crawl every enabled site of the module-level ``staSitLst``."""
    getSitLst(staSitLst)


# Start the crawl only when the file is executed as a script, so that
# importing the module does not trigger network requests and file writes.
if __name__ == "__main__":
    getSit()
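# Blog page footer captured together with the script (translated from Japanese):
#   Posted: Monday, 5 August 2019
#   Post title: getSitImg.py
#   Subscribe to: Post Comments (Atom)
#   0 comments:
#   Post a Comment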