メモ

主にプログラミング系の備忘録

選挙ドットコムの過去選挙結果をスクレイピング

選挙ドットコム
go2senkyo.com

の地方選挙の選挙結果をスクレイピング
ie.navigate のURL部分を引数にすればガッツリとれるような気がする

※世田谷区議選でのみ動作確認

'参照設定 Microsoft HTML Object Library
'         Microsoft Internet Controls
' ieCheck: https://www.vba-ie.net/code/iecheck.php


Public Sub getSenkyoData()
  Dim ie As InternetExplorer: Set ie = New InternetExplorer ': ie.Visible = True
  ie.navigate "https://go2senkyo.com/local/senkyo/9926"
  Call ieCheck(ie)
  
  Dim ws As Worksheet: Set ws = ThisWorkbook.Worksheets.Add
  
  Dim dom As HTMLDocument: Set dom = ie.document
  Dim el As IHTMLElement
  Dim nRow As Long: nRow = 2
  Dim buf As Variant
  
  '選挙情報を取得する
  ws.Range("J1") = "都道府県"
  ws.Range("K1") = dom.getElementsByClassName("p_local_senkyo_ttl_wrapp")(0).getElementsByTagName("p")(0).innerText
  
  ws.Range("J2") = "選挙名"
  buf = Split(dom.getElementsByTagName("h1")(1).innerText, vbCrLf)
  ws.Range("K2") = buf(0)
  
  ws.Range("J3") = "投票日"
  ws.Range("K3") = dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(0).getElementsByTagName("td")(0).innerText
  
  ws.Range("J4") = "投票率"
  buf = Split(dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(0).getElementsByTagName("td")(1).innerText, "(")
  ws.Range("K4") = buf(0)
  
  ws.Range("J5") = "定数"
  buf = Split(dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(0).getElementsByTagName("td")(2).innerText, "/")
  ws.Range("K5") = buf(0)
  
  ws.Range("J6") = "告示日"
  ws.Range("K6") = dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(1).getElementsByTagName("td")(0).innerText
  
  ws.Range("J7") = "前回投票率"
  ws.Range("K7") = dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(1).getElementsByTagName("td")(1).innerText

  '見出し
  ws.Range("A1") = "当落"
  ws.Range("B1") = "氏名"
  ws.Range("C1") = "氏名カナ"
  ws.Range("D1") = "所属"
  ws.Range("E1") = "年齢"
  ws.Range("F1") = "性別"
  ws.Range("G1") = "現元新"
  ws.Range("H1") = "得票数"
  
  For Each el In dom.getElementsByClassName("m_senkyo_result_table")(0).getElementsByTagName("tr")
    If el.getElementsByTagName("td")(0).innerHTML <> "" Then ws.Range("A" & nRow) = "当選"
    ws.Range("B" & nRow) = Replace(Replace(el.getElementsByClassName("m_senkyo_result_data_ttl")(0).innerText, el.getElementsByClassName("m_senkyo_result_data_kana")(0).innerText, ""), vbCrLf, "") '氏名
    ws.Range("C" & nRow) = el.getElementsByClassName("m_senkyo_result_data_kana")(0).innerText 'カナ氏名
    ws.Range("D" & nRow) = el.getElementsByClassName("m_senkyo_result_data_circle")(0).innerText '所属
    buf = Split(el.getElementsByClassName("m_senkyo_result_data_para")(0).getElementsByTagName("span")(0).innerText, "(") '年齢と性別を分離
    ws.Range("E" & nRow) = Replace(buf(0), "歳", "") '年齢
    ws.Range("F" & nRow) = Replace(buf(1), ")", "") '性別
    ws.Range("G" & nRow) = el.getElementsByClassName("m_senkyo_result_data_para")(0).getElementsByTagName("span")(1).innerText '現元新
    ws.Range("H" & nRow) = Trim(Replace(el.getElementsByClassName("right")(0).innerText, "票", "")) '得票数
    nRow = nRow + 1
  Next el

End Sub

PythonでAccessを操作

import win32com.client
import pandas as pd

cn = win32com.client.gencache.EnsureDispatch('ADODB.Connection')
cn.Open( "Provider=Microsoft.ACE.OLEDB.12.0;Data Source=C:/DB.accdb" )

rs = win32com.client.Dispatch('ADODB.Recordset')
rs.Open("T",cn,win32com.client.constants.adOpenDynamic,win32com.client.constants.adLockOptimistic)

#DELETE
cn.Execute("DELETE * FROM T")

#データ再登録
df = pd.read_csv("a.csv")
for line in df.values:
    rs.AddNew()
    for i,col in enumerate(line):
        rs.Fields(i).Value = col
    rs.Update()

#参照サンプル
while not rs.EOF:
    for i in range(0,rs.Fields.Count -1):
        print(str(i) + ":" + str(rs.Fields(i).Name) + ":" + str(rs.Fields(i).Value))
    rs.MoveNext()


rs.Close()
cn.Close()

初めてのPython。せっかくだからBackLogAPIもついでにいじってみた

pybacklog https://github.com/netmarkjp/pybacklog

xlwings https://www.xlwings.org/
使ってます。

backlogfunc.py

from pybacklog import BacklogClient
from xlwings import *

SPACE_NAME = ''
API_KEY = ''
PROJECT_KEY = ''

# バージョン/ マイルストーン情報登録
def setVersion(VersionName,StartDate,ReleaseDate):

    client = BacklogClient(SPACE_NAME, API_KEY)

    response = client.create_version(PROJECT_KEY,VersionName,StartDate,ReleaseDate)

    return response["id"]

# 課題情報取得
def getTemplateIssue(IssueKey):
    client = BacklogClient(SPACE_NAME, API_KEY)

    # テンプレート課題の情報を取得
    TemplateIssue = client.issue(IssueKey)

    tmp = [TemplateIssue["category"][0]["id"] , TemplateIssue["summary"],TemplateIssue["description"]]
    
    return tmp
    
# 課題登録
def setIssue(strSummary,strStartDate,strDueDate,strDescription,strCategoryID,strVersionID,strParentIssueKey=''):
    client = BacklogClient(SPACE_NAME, API_KEY)

    #ProjectID
    PID = client.get_project_id(PROJECT_KEY)

    #IssueTypeID を取得
    IssueTypeList = client.project_issue_types(PROJECT_KEY)

    for IssueType in IssueTypeList:
        if IssueType["name"] == "タスク" :
            IssueTypeID = IssueType["id"]
            break

    #課題を登録
    ExtraRequestParam = {
        "description":strDescription,
        "startDate" :strStartDate,
        "dueDate":strDueDate,
        "categoryId[]":strCategoryID,
        "versionId[]":strVersionID
    }


    # 親課題に紐づける場合
    if strParentIssueKey !='':
        ExtraRequestParam["parentIssueId"] = str(client.get_issue_id(strParentIssueKey))

    response = client.create_issue(str(PID),strSummary,str(IssueTypeID),"3",ExtraRequestParam)


    # 課題IDを返す
    return response["id"]

# Excelから課題情報取得

def xl_getIssue():
    wb = Book.caller()
    ws = wb.sheets('WORK')

    ret = getTemplateIssue(ws.range('A1').value)

    ws.range('A2').value = ret[0]
    ws.range('A3').value = ret[1]
    ws.range('A4').value = ret[2]

# Excelからバージョン情報登録
def xl_setVersion():
    wb = Book.caller()
    ws = wb.sheets('WORK')

    ws.range('A4').value = setVersion(ws.range('A1').value,ws.range('A2').value,ws.range('A3').value)

# Excelから親課題を登録
def xl_setIssue_Parent():
    wb = Book.caller()
    ws = wb.sheets('WORK')
    #debug

    ws.range('A8').value = setIssue(\
    ws.range('A1').value,
    ws.range('A2').value,
    ws.range('A3').value,
    ws.range('A4').value,
    ws.range('A5').value,
    ws.range('A6').value)

# Excelから子課題を登録        
def xl_setIssue_Child():
    wb = Book.caller()
    ws = wb.sheets('WORK')
    #debug

    ws.range('A8').value = setIssue(\
    ws.range('A1').value,
    ws.range('A2').value,
    ws.range('A3').value,
    ws.range('A4').value,
    ws.range('A5').value,
    ws.range('A6').value,
    ws.range('A7').value)

Excel側はこんな感じ

' 課題情報取得
Public Sub py_getIssue()
  
  Dim wb As Workbook: Set wb = ThisWorkbook
  Dim wsCalc As Worksheet: Set wsCalc = wb.Worksheets("WORK")
  
  wsCalc.Cells.Clear
  wsCalc.Range("A:A").NumberFormatLocal = "@"
  
  wsCalc.Range("A1").Value = "" '課題Key
  RunPython ("import backlogfunc; backlogfunc.xl_getIssue();")
  
End Sub

' バージョン情報登録
Public Sub py_setVersion()
  Dim wb As Workbook: Set wb = ThisWorkbook
  Dim wsCalc As Worksheet: Set wsCalc = wb.Worksheets("WORK")
  
  wsCalc.Cells.Clear
  wsCalc.Range("A:A").NumberFormatLocal = "@"
  
  wsCalc.Range("A1").Value = "EXCELから登録テスト"
  wsCalc.Range("A2").Value = Format(Now(), "YYYY-MM-DD")
  wsCalc.Range("A3").Value = Format(Now() + 30, "YYYY-MM-DD")
  
  RunPython ("import backlogfunc; backlogfunc.xl_setVersion();")


End Sub

' 課題登録
Public Sub py_setIssue()
  Dim wb As Workbook: Set wb = ThisWorkbook
  Dim wsCalc As Worksheet: Set wsCalc = wb.Worksheets("WORK")
  
  '1:strSummary
  '2:strStartDate
  '3:strDueDate
  '4:strDescription
  '5:strCategoryID
  '6:strVersionID
  '7:strParentIssueKey
  '8:IssueID(返り値)
  
  '試作品につき、課題件名 カテゴリID 詳細はほかの課題から取得 実際は引数に
  Call py_getIssue
  Dim sSummary As String: sSummary = wsCalc.Range("A3").Value
  Dim sCategoryId As String: sCategoryId = wsCalc.Range("A2").Value
  Dim sDescription As String: sDescription = wsCalc.Range("A4").Value
  
  With wsCalc

    .Cells.Clear
    .Range("A:A").NumberFormatLocal = "@"
  
    .Range("A1").Value = sSummary & "_Excelから登録"
    .Range("A2").Value = Format(Date, "YYYY-MM-DD")
    .Range("A3").Value = Format(Date + 30, "YYYY-MM-DD")
    .Range("A4").Value = sDescription
    .Range("A5").Value = sCategoryId
    .Range("A6").Value = "" 'バージョンID
    .Range("A7").Value = "" '親課題キー(未設定の場合は親課題として登録/設定がある場合は子課題として登録)
    
  End With
  
  If wsCalc.Range("A7").Value = "" Then
    RunPython ("import backlogfunc; backlogfunc.xl_setIssue_Parent();")
  Else
    RunPython ("import backlogfunc; backlogfunc.xl_setIssue_Child();")
  End If

End Sub

TripAdviserの一覧からスポット情報をスクレイピング

先に記載した関数で取得したURLから、個々のスポット情報を取得

Public Function getSpotList(sUrl As String) As Variant
  Dim ie As InternetExplorer: Set ie = New InternetExplorer
  Dim sRet As String
  
  Dim dom As HTMLDocument
  Dim el As IHTMLElement
  Dim vtmp As Variant
  Dim f As Boolean
  Dim i As Long
  Dim j As Long
  Dim nPage As Long
  Dim ie2 As InternetExplorer
  Dim sFullAddress As String
  Dim sZipCode As String
  ie.navigate sUrl
  nPage = 1
  Call MySleep(1000)
  Call ieCheck(ie)
  Set dom = ie.Document
  f = True
  
  Do While f
    For Each el In dom.getElementsByClassName("near_listing")
      sRet = sRet & ConvTSV(el.getElementsByClassName("location_name")(0).innerText)
      sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("location_name")(0).getElementsByTagName("a")(0).getAttribute("href"))
      sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("address_box")(0).innerText)
      sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("distance")(0).getElementsByTagName("b")(0).innerText)

'ここで住所詳細も取得しよう。
'ただ、一覧だけだと7アクセスで済んだのが 個別ページまでアクセスすると207アクセス必要になるため
'やらない方がいいかも…。

      Set ie2 = New InternetExplorer: ie2.navigate "https://www.tripadvisor.jp" & el.getElementsByClassName("location_name")(0).getElementsByTagName("a")(0).getAttribute("href")
      Call MySleep(1000)
      Call ieCheck(ie2)
      
      sFullAddress = "": sZipCode = ""
      
      If ie2.Document.getElementsByClassName("address").length > 0 Then
        sFullAddress = Trim(ie2.Document.getElementsByClassName("address")(0).innerText)
        If ie2.Document.getElementsByClassName("postal-code").length > 0 Then sZipCode = Trim(ie2.Document.getElementsByClassName("postal-code")(0).innerText)
        sFullAddress = Replace(sFullAddress, sZipCode, "") & "\t" & sZipCode
        sRet = sRet & vbTab & ConvTSV(sFullAddress)
      End If
      ie2.Quit
      
'listinfo→評価が付かなくても入っていることがある…。
      If el.getElementsByClassName("listInfo").length > 0 Then
        sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("listInfo")(0).innerText)
'          ws.Range("G" & i).Value = el.getElementsByClassName("listInfo")(0).innerText
      End If
'ui_bubble_rating
      If el.getElementsByClassName("ui_bubble_rating").length > 0 Then
        vtmp = Split(el.getElementsByClassName("ui_bubble_rating")(0).getAttribute("alt"), " ")
        For j = LBound(vtmp) To UBound(vtmp)
          sRet = sRet & vbTab & ConvTSV(CStr(vtmp(j)))
        Next j
      End If
'more(口コミ件数)
      If el.getElementsByClassName("more").length > 0 Then
        sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("more")(0).innerText)
      End If
'popranking
      If el.getElementsByClassName("popRanking").length > 0 Then
        vtmp = Split(el.getElementsByClassName("popRanking")(0).innerText, " ")
        For j = LBound(vtmp) To UBound(vtmp)
          sRet = sRet & vbTab & ConvTSV(CStr(vtmp(j)))
        Next j
      End If
      sRet = sRet & vbCrLf
    Next el

'次のページ存在チェック    
    f = False
    For Each el In dom.getElementsByClassName("sprite-pageNext")
      If el.tagName = "A" Then
        f = True
        el.Click
        el.FireEvent ("onclick")
        Call ieCheck(ie)
        Call MySleep(1000)
        nPage = nPage + 1
        Exit For
      End If
    Next el
  Loop
  ie.Quit
  
  getSpotList = sRet
  
End Function

おまけ:タブ区切りテキストで出力するための変換関数

Public Function ConvTSV(ByVal str As String) As String
  Dim sRet As String
  sRet = Replace(Trim(str), """", """""") 'ダブルクオートのエスケープ & トリム
  sRet = Replace(sRet, vbCr, "\r") 'CRを\rに変換
  sRet = Replace(sRet, vbLf, "\n") 'LFを\nに変換
  sRet = """" & sRet & """"  '前後にダブルクオートをつける
  ConvTSV = sRet
  
End Function

サンプルプログラム:

Public Sub test_getSpotList(ParamArray vKeyWords())
  Dim fs As FileSystemObject: Set fs = New FileSystemObject
  Dim vtemp As Variant: vtemp = vKeyWords()
  
  Dim sFileName As String
  Dim i As Long
  For i = LBound(vKeyWords) To UBound(vKeyWords)
    If sFileName <> "" Then sFileName = sFileName & "_"
    sFileName = sFileName & CStr(vKeyWords(i))
  Next i
  Dim ts As TextStream: Set ts = fs.CreateTextFile("C:\" & sFileName & Format(Now(), "yymmddhhnnss") & ".txt")
  ts.Write getSpotList(getTAURL(vtemp))
  ts.Close
  Set fs = Nothing
End Sub

TripAdviserのスポット一覧のURLを取得

GoogleでKeyWordを+でつなぐ+「site:www.tripadvisor.jp」で検索
→検索結果タイトル&本文にKeyWordsが含まれるもののリンク先URLを取得
Hitするものが存在しなかった場合は空文字を返す
KeyWords はスポット名 + 地域名(市区町村あたり) + ジャンル(観光/グルメ)あたりを想定

ieCheckは以下のものを使用 ついでに1秒ウェイトを入れるためのmySleep関数をここに書いてる
https://www.vba-ie.net/code/iecheck.html

Public Function getTAURL(vKeyWords As Variant) As String

  Dim ie As InternetExplorer: Set ie = New InternetExplorer
  Dim sKeyWords As String: sKeyWords = ""
  Dim i As Long
  Dim f As Boolean
  For i = LBound(vKeyWords) To UBound(vKeyWords)
    If sKeyWords <> "" Then sKeyWords = sKeyWords & "+"
    sKeyWords = sKeyWords & CStr(vKeyWords(i))
  Next i
  ie.navigate "https://www.google.co.jp/search?num=100&q=" & sKeyWords & "+site%3Awww.tripadvisor.jp"
  Call MySleep(1000)
  Call ieCheck(ie)
  Dim el As IHTMLElement
  
' 地名+駅名とかにするとh3だけの評価ではだめなので、本文部分もチェックする
  For Each el In ie.Document.getElementsByClassName("srg")(0).getElementsByClassName("rc")
    f = True
    For i = LBound(vKeyWords) To UBound(vKeyWords)
      If InStr(1, el.innerText, CStr(vKeyWords(i))) = 0 Then
        f = False
        Exit For
      End If
    Next i
    If f Then Exit For
  Next el

  If f Then
    getTAURL = el.getElementsByTagName("h3")(0).getElementsByTagName("a")(0).getAttribute("href")
  Else
    getTAURL = ""
  End If
  ie.Quit
End Function

おまけ、テスト用のデータとして、青空文庫のHTMLから見出し/ルビ/注釈を除去したテキストを取得

Public Function getAozora(ByVal sUrl As String) As String
  Dim ie As InternetExplorer: Set ie = New InternetExplorer: ie.navigate sUrl: Call ieCheck(ie)
  Dim dom As HTMLDocument: Set dom = ie.document
  Dim sTitle As String: Dim sAuthor As String
  sTitle = dom.getElementsByClassName("title")(0).innerText
  sAuthor = dom.getElementsByClassName("author")(0).innerText
  
  Dim el As IHTMLElement
  'ルビの削除
  For Each el In dom.getElementsByClassName("main_text")(0).getElementsByTagName("ruby")
    el.outerHTML = el.getElementsByTagName("rb")(0).innerText
  Next el
  '注釈の削除
  For Each el In dom.getElementsByClassName("main_text")(0).getElementsByTagName("span")
    el.outerHTML = ""
  Next el
  '見出しの削除
  For Each el In dom.getElementsByClassName("main_text")(0).getElementsByTagName("div")
    el.outerHTML = ""
  Next el
  
  
  getAozora = dom.getElementsByClassName("main_text")(0).innerText ' sTitle & vbTab & sAuthor & vbTab &

  ie.Quit
End Function