2019年8月5日月曜日

getSitImg.py

import os
import re
import requests
import time
from bs4 import BeautifulSoup

# Per-site crawl configuration, keyed by site name.
#
# Keys read by the functions in this file:
#   itemEnable          getSitLst only crawls sites whose flag is set.
#   staLstUrl           Station-list page fetched by getStaLst.
#   reImgLstUrl         Regex matched against <a href> values on the list page.
#   reImgLstId          re.sub replacement template applied to a matching href.
#   fmtImgLstUrl        str.format template turning that id into a station-page URL.
#   reStaImgUrl         Regex matched against <img src> values on a station page.
#   reStaImgId          re.sub replacement template applied to a matching src.
#   fmtStaImgUrl        str.format template turning that id into the image URL.
#   reStaImgUrlFilBas   Regex applied to "<station page URL>,<image URL>" ...
#   regStaImgUrlFilBas  ... and its replacement template, giving the file base name.
#   reStaImgUrlFilExt   Regex applied to the same string ...
#   regStaImgUrlFilExt  ... and its replacement template, giving the file extension.
#
# "ocvSitUrl", "imgUrl" and "z" are not read by any function visible in this
# file (NOTE(review): presumably informational / placeholder entries).
#
# All patterns are raw strings so that "\." reaches the regex engine verbatim
# without relying on Python passing unknown escape sequences through.
staSitLst = {
    # jreast
    #
    # Station list: https://www.jreast.co.jp/estation/stations/
    #   sample anchor:
    #     <A href="http://www.jreast.co.jp/estation/stations/1039.html"
    #        target = "_blank" onclick="openMap('1039');return false;">
    #   (link text is the station name)
    #
    # Station page: https://www.jreast.co.jp/estation/stations/1039.html
    #   sample image: <img src="img/floormap/1039_1f.png" alt="...">
    #
    # Image URL: https://www.jreast.co.jp/estation/stations/img/floormap/1039_1f.png
    "jreast": {
        "itemEnable": False,
        "ocvSitUrl": "https://www.jreast.co.jp/",
        "staLstUrl": "https://www.jreast.co.jp/estation/stations",
        "reImgLstUrl": r"^(http://www\.jreast\.co\.jp/estation/stations/[0-9]+\..+)$",
        "reImgLstId": r"\1",
        "fmtImgLstUrl": "{}",
        "reStaImgUrl": r"^(img/floormap/.+\..+)$",
        "reStaImgId": r"\1",
        "fmtStaImgUrl": "https://www.jreast.co.jp/estation/stations/{}",
        "reStaImgUrlFilBas": r"^.*img/floormap/([^.]+)\.[^.]+.*$",
        "reStaImgUrlFilExt": r"^.*img/floormap/[^.]+\.([^.]+)$",
        "regStaImgUrlFilBas": r"\1",
        "regStaImgUrlFilExt": r"\1",
        "imgUrl": "{}/{}",
        "z": "z",
    },
    # keikyu
    #
    # Station list: https://www.keikyu.co.jp/train-info/kakueki/index.html
    #   sample link:
    #     <area coords="206,48,426,88" href="/train-info/kakueki/KK01.html" alt="..."
    #           data-imagemap-rollover-url="/assets/image/train-info/kakueki/index_img_route_KK01.png">
    #
    # Station page: https://www.keikyu.co.jp/train-info/kakueki/KK01.html
    #   sample image: <img src="/assets/image/train-info/kakueki/KK01/img_03.png" alt="...">
    #
    # Image URL: https://www.keikyu.co.jp/assets/image/train-info/kakueki/KK01/img_03.png
    "keikyu": {
        "itemEnable": False,
        "ocvSitUrl": "https://www.keikyu.co.jp/",
        "staLstUrl": "https://www.keikyu.co.jp/train-info/kakueki/index.html",
        "reImgLstUrl": r"^(/train-info/kakueki/KK[0-9][0-9]\.htm.*)",
        "reImgLstId": r"\1",
        "fmtImgLstUrl": "https://www.keikyu.co.jp{}",
        "reStaImgUrl": r"^(/assets/image/train-info/kakueki/KK[0-9][0-9]/img_03\..+)$",
        "reStaImgId": r"\1",
        "fmtStaImgUrl": "https://www.keikyu.co.jp{}",
        "reStaImgUrlFilBas": r"^.*/assets/image/train-info/kakueki/(KK[0-9][0-9])/img_03.[^.]+.*$",
        "reStaImgUrlFilExt": r"^.*/assets/image/train-info/kakueki/KK[0-9][0-9]/img_03.([^.]+).*$",
        "regStaImgUrlFilBas": r"\1",
        "regStaImgUrlFilExt": r"\1",
        "imgUrl": "{}/{}",
        "z": "z",
    },
    # tokyometro
    #
    # Station list: https://www.tokyometro.jp/station/index03.html
    #   sample anchor: <a href="./akihabara/index.html">
    #
    # Station page: https://www.tokyometro.jp/station/akihabara/yardmap/index.html#adjacent
    #   sample images:
    #     <p class="v2_yardmapImg"><img src="../../yardmap_img/_station_%E7%A7%8B%E8%91%89%E5%8E%9F_yardmap_images_yardmap.jpg" alt="" class="v2_js-yardmapImg"></p>
    #     <p class="v2_yardmapImg"><img src="../../yardmap_img/figure_yardmap_ayase.gif" alt="" class="v2_js-yardmapImg"></p>
    #
    # Image URLs:
    #   https://www.tokyometro.jp/station/yardmap_img/_station_%E7%A7%8B%E8%91%89%E5%8E%9F_yardmap_images_yardmap.jpg
    #   https://www.tokyometro.jp/station/yardmap_img/figure_yardmap_ayase.gif
    #
    # Unlike the other sites, the file base name here is taken from the
    # station-page URL (the path segment before "/yardmap/index.html").
    "tokyometro": {
        "itemEnable": False,
        "ocvSitUrl": "https://www.tokyometro.jp/",
        "staLstUrl": "https://www.tokyometro.jp/station/index03.html",
        "reImgLstUrl": r"^\./(.+)/index\.html$",
        "reImgLstId": r"\1",
        "fmtImgLstUrl": "https://www.tokyometro.jp/station/{}/yardmap/index.html",
        "reStaImgUrl": r"^\.\./\.\./(yardmap_img/.+\..+)$",
        "reStaImgId": r"\1",
        "fmtStaImgUrl": "https://www.tokyometro.jp/station/{}",
        "reStaImgUrlFilBas": r"^.*https://www.tokyometro.jp/station/([^/]+)/yardmap/index.html.*$",
        "reStaImgUrlFilExt": r"^.*/yardmap_img/.+\.([^.]+).*$",
        "regStaImgUrlFilBas": r"\1",
        "regStaImgUrlFilExt": r"\1",
        "imgUrl": "{}/{}",
        "z": "z",
    },
}

# Pause, in seconds, handed to time.sleep() by getStaImg after each image download.
ocvItvlt = 10
def getStaImg(ocvSit, imgLstUrl, staImgUrl, timeout=30):
    """Download one station image and save it under ``data/_img/<site>/floormap/``.

    The file name is ``<base>.<ext>``, where both parts come from applying the
    site's ``reStaImgUrlFilBas`` / ``reStaImgUrlFilExt`` patterns (with their
    ``regStaImgUrlFil*`` replacement templates) to the single string
    ``"<imgLstUrl>,<staImgUrl>"``.  Joining both URLs lets a site take the
    name from either the station page URL or the image URL.

    Args:
        ocvSit: Key of the site entry in the module-level ``staSitLst``.
        imgLstUrl: URL of the station page the image was found on.
        staImgUrl: Absolute URL of the image to download.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the crawl indefinitely.

    A response with an error status is reported and not written to disk;
    otherwise the error page body would be saved as if it were an image.
    After the request has completed the function sleeps ``ocvItvlt`` seconds.
    """
    sitCfg = staSitLst[ocvSit]
    urlPair = imgLstUrl + "," + staImgUrl
    imgFilBas = re.sub(sitCfg["reStaImgUrlFilBas"], sitCfg["regStaImgUrlFilBas"], urlPair)
    imgFilExt = re.sub(sitCfg["reStaImgUrlFilExt"], sitCfg["regStaImgUrlFilExt"], urlPair)

    imgDirPth = "data/_img/{}/floormap".format(ocvSit)
    imgFilPth = "{}/{}.{}".format(imgDirPth, imgFilBas, imgFilExt)
    print("# staImgUrl=" + staImgUrl)
    print("# imgFilPth=" + imgFilPth)

    os.makedirs(imgDirPth, exist_ok=True)
    staImg = requests.get(staImgUrl, timeout=timeout)
    if staImg.ok:
        with open(imgFilPth, "wb") as file:
            file.write(staImg.content)
    else:
        # Best effort: keep crawling, but do not store an error body as an image.
        print("# skip: status=" + str(staImg.status_code) + ", staImgUrl=" + staImgUrl)
    # Throttle between downloads regardless of the outcome.
    time.sleep(ocvItvlt)

def getImgLst(ocvSit, imgLstUrl):
    """Fetch one station page and download every image whose ``src`` matches.

    Each ``<img>`` whose ``src`` matches the site's ``reStaImgUrl`` is rewritten
    with ``reStaImgId`` and slotted into ``fmtStaImgUrl`` to obtain the image
    URL, which is then handed to ``getStaImg`` together with the page URL.

    Args:
        ocvSit: Key of the site entry in the module-level ``staSitLst``.
        imgLstUrl: URL of the station page to scan.
    """
    sitCfg = staSitLst[ocvSit]
    srcPat = sitCfg["reStaImgUrl"]
    idTpl = sitCfg["reStaImgId"]
    urlFmt = sitCfg["fmtStaImgUrl"]

    pagHtml = requests.get(imgLstUrl).text
    imgTags = BeautifulSoup(pagHtml, "lxml").find_all("img", src=re.compile(srcPat))
    for imgTag in imgTags:
        staImgId = re.sub(srcPat, idTpl, imgTag["src"])
        staImgUrl = urlFmt.format(staImgId)
        print(f"# staImgId={staImgId}, staImgUrl={staImgUrl}")
        getStaImg(ocvSit, imgLstUrl, staImgUrl)

def getStaLst(ocvSit, staLstUrl):
    """Fetch a site's station-list page and visit every matching station link.

    Each ``<a>`` whose ``href`` matches the site's ``reImgLstUrl`` is rewritten
    with ``reImgLstId`` and slotted into ``fmtImgLstUrl`` to obtain the station
    page URL, which is then passed to ``getImgLst``.

    Args:
        ocvSit: Key of the site entry in the module-level ``staSitLst``.
        staLstUrl: URL of the station-list page.
    """
    sitCfg = staSitLst[ocvSit]
    hrefPat, idTpl, urlFmt = (
        sitCfg["reImgLstUrl"],
        sitCfg["reImgLstId"],
        sitCfg["fmtImgLstUrl"],
    )

    lstHtml = requests.get(staLstUrl).text
    parsed = BeautifulSoup(lstHtml, "lxml")
    for anchor in parsed.find_all("a", href=re.compile(hrefPat)):
        staPagId = re.sub(hrefPat, idTpl, anchor["href"])
        imgLstUrl = urlFmt.format(staPagId)
        print(f"# staPagId={staPagId}, imgLstUrl={imgLstUrl}")
        getImgLst(ocvSit, imgLstUrl)

def getSitLst(staSitLst):
    """Crawl every enabled site in *staSitLst*.

    Args:
        staSitLst: Mapping of site name to its configuration dict.  Each
            configuration must provide ``"itemEnable"`` and, for enabled
            sites, ``"staLstUrl"``.

    Sites whose ``"itemEnable"`` value is falsy are skipped; for every other
    site the station-list URL is printed and handed to ``getStaLst``.
    """
    for staSitNam, sitCfg in staSitLst.items():
        if not sitCfg["itemEnable"]:
            continue
        staLstUrl = sitCfg["staLstUrl"]
        print("# staSitNam=" + staSitNam + ", staLstUrl=" + staLstUrl)
        getStaLst(staSitNam, staLstUrl)

def getSit():
    """Run the crawl over the module-level ``staSitLst`` configuration."""
    sitCfgs = staSitLst
    getSitLst(sitCfgs)

# Start the crawl. There is no ``if __name__ == "__main__"`` guard, so this
# call also runs when the module is imported.
getSit()

0 件のコメント:

コメントを投稿