世田谷区議会議事録の件
robots.txt
見たらdisallowだったので
議事録スクレイピングの記事は取り下げました
ただなぜdisallowなのか区の担当者に
問い合わせしようと思います
問い合わせ先が電話かFAXという時点で
うーん
選挙ドットコムの過去選挙結果をスクレイピング
選挙ドットコム
go2senkyo.com
の地方選挙の選挙結果をスクレイピング。
ie.navigate のURL部分を引数にすればガッツリとれるような気がする
※世田谷区議選でのみ動作確認
'参照設定 Microsoft HTML Object Library ' Microsoft Internet Controls ' ieCheck: https://www.vba-ie.net/code/iecheck.php Public Sub getSenkyoData() Dim ie As InternetExplorer: Set ie = New InternetExplorer ': ie.Visible = True ie.navigate "https://go2senkyo.com/local/senkyo/9926" Call ieCheck(ie) Dim ws As Worksheet: Set ws = ThisWorkbook.Worksheets.Add Dim dom As HTMLDocument: Set dom = ie.document Dim el As IHTMLElement Dim nRow As Long: nRow = 2 Dim buf As Variant '選挙情報を取得する ws.Range("J1") = "都道府県" ws.Range("K1") = dom.getElementsByClassName("p_local_senkyo_ttl_wrapp")(0).getElementsByTagName("p")(0).innerText ws.Range("J2") = "選挙名" buf = Split(dom.getElementsByTagName("h1")(1).innerText, vbCrLf) ws.Range("K2") = buf(0) ws.Range("J3") = "投票日" ws.Range("K3") = dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(0).getElementsByTagName("td")(0).innerText ws.Range("J4") = "投票率" buf = Split(dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(0).getElementsByTagName("td")(1).innerText, "(") ws.Range("K4") = buf(0) ws.Range("J5") = "定数" buf = Split(dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(0).getElementsByTagName("td")(2).innerText, "/") ws.Range("K5") = buf(0) ws.Range("J6") = "告示日" ws.Range("K6") = dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(1).getElementsByTagName("td")(0).innerText ws.Range("J7") = "前回投票率" ws.Range("K7") = dom.getElementsByClassName("m_senkyo_data")(0).getElementsByTagName("tr")(1).getElementsByTagName("td")(1).innerText '見出し ws.Range("A1") = "当落" ws.Range("B1") = "氏名" ws.Range("C1") = "氏名カナ" ws.Range("D1") = "所属" ws.Range("E1") = "年齢" ws.Range("F1") = "性別" ws.Range("G1") = "現元新" ws.Range("H1") = "得票数" For Each el In dom.getElementsByClassName("m_senkyo_result_table")(0).getElementsByTagName("tr") If el.getElementsByTagName("td")(0).innerHTML <> "" Then ws.Range("A" & nRow) = "当選" ws.Range("B" & nRow) = Replace(Replace(el.getElementsByClassName("m_senkyo_result_data_ttl")(0).innerText, el.getElementsByClassName("m_senkyo_result_data_kana")(0).innerText, ""), vbCrLf, "") '氏名 ws.Range("C" & nRow) = el.getElementsByClassName("m_senkyo_result_data_kana")(0).innerText 'カナ氏名 ws.Range("D" & nRow) = el.getElementsByClassName("m_senkyo_result_data_circle")(0).innerText '所属 buf = Split(el.getElementsByClassName("m_senkyo_result_data_para")(0).getElementsByTagName("span")(0).innerText, "(") '年齢と性別を分離 ws.Range("E" & nRow) = Replace(buf(0), "歳", "") '年齢 ws.Range("F" & nRow) = Replace(buf(1), ")", "") '性別 ws.Range("G" & nRow) = el.getElementsByClassName("m_senkyo_result_data_para")(0).getElementsByTagName("span")(1).innerText '現元新 ws.Range("H" & nRow) = Trim(Replace(el.getElementsByClassName("right")(0).innerText, "票", "")) '得票数 nRow = nRow + 1 Next el End Sub
PythonでAccessを操作
import win32com.client import pandas as pd cn = win32com.client.gencache.EnsureDispatch('ADODB.Connection') cn.Open( "Provider=Microsoft.ACE.OLEDB.12.0;Data Source=C:/DB.accdb" ) rs = win32com.client.Dispatch('ADODB.Recordset') rs.Open("T",cn,win32com.client.constants.adOpenDynamic,win32com.client.constants.adLockOptimistic) #DELETE cn.Execute("DELETE * FROM T") #データ再登録 df = pd.read_csv("a.csv") for line in df.values: rs.AddNew() for i,col in enumerate(line): rs.Fields(i).Value = col rs.Update() #参照サンプル while not rs.EOF: for i in range(0,rs.Fields.Count -1): print(str(i) + ":" + str(rs.Fields(i).Name) + ":" + str(rs.Fields(i).Value)) rs.MoveNext() rs.Close() cn.Close()
初めてのPython。せっかくだからBackLogAPIもついでにいじってみた
pybacklog https://github.com/netmarkjp/pybacklog
と
xlwings https://www.xlwings.org/
使ってます。
backlogfunc.py
from pybacklog import BacklogClient from xlwings import * SPACE_NAME = '' API_KEY = '' PROJECT_KEY = '' # バージョン/ マイルストーン情報登録 def setVersion(VersionName,StartDate,ReleaseDate): client = BacklogClient(SPACE_NAME, API_KEY) response = client.create_version(PROJECT_KEY,VersionName,StartDate,ReleaseDate) return response["id"] # 課題情報取得 def getTemplateIssue(IssueKey): client = BacklogClient(SPACE_NAME, API_KEY) # テンプレート課題の情報を取得 TemplateIssue = client.issue(IssueKey) tmp = [TemplateIssue["category"][0]["id"] , TemplateIssue["summary"],TemplateIssue["description"]] return tmp # 課題登録 def setIssue(strSummary,strStartDate,strDueDate,strDescription,strCategoryID,strVersionID,strParentIssueKey=''): client = BacklogClient(SPACE_NAME, API_KEY) #ProjectID PID = client.get_project_id(PROJECT_KEY) #IssueTypeID を取得 IssueTypeList = client.project_issue_types(PROJECT_KEY) for IssueType in IssueTypeList: if IssueType["name"] == "タスク" : IssueTypeID = IssueType["id"] break #課題を登録 ExtraRequestParam = { "description":strDescription, "startDate" :strStartDate, "dueDate":strDueDate, "categoryId[]":strCategoryID, "versionId[]":strVersionID } # 親課題に紐づける場合 if strParentIssueKey !='': ExtraRequestParam["parentIssueId"] = str(client.get_issue_id(strParentIssueKey)) response = client.create_issue(str(PID),strSummary,str(IssueTypeID),"3",ExtraRequestParam) # 課題IDを返す return response["id"] # Excelから課題情報取得 def xl_getIssue(): wb = Book.caller() ws = wb.sheets('WORK') ret = getTemplateIssue(ws.range('A1').value) ws.range('A2').value = ret[0] ws.range('A3').value = ret[1] ws.range('A4').value = ret[2] # Excelからバージョン情報登録 def xl_setVersion(): wb = Book.caller() ws = wb.sheets('WORK') ws.range('A4').value = setVersion(ws.range('A1').value,ws.range('A2').value,ws.range('A3').value) # Excelから親課題を登録 def xl_setIssue_Parent(): wb = Book.caller() ws = wb.sheets('WORK') #debug ws.range('A8').value = setIssue(\ ws.range('A1').value, ws.range('A2').value, ws.range('A3').value, ws.range('A4').value, ws.range('A5').value, ws.range('A6').value) # Excelから子課題を登録 def xl_setIssue_Child(): wb = Book.caller() ws = wb.sheets('WORK') #debug ws.range('A8').value = setIssue(\ ws.range('A1').value, ws.range('A2').value, ws.range('A3').value, ws.range('A4').value, ws.range('A5').value, ws.range('A6').value, ws.range('A7').value)
Excel側はこんな感じ
' 課題情報取得 Public Sub py_getIssue() Dim wb As Workbook: Set wb = ThisWorkbook Dim wsCalc As Worksheet: Set wsCalc = wb.Worksheets("WORK") wsCalc.Cells.Clear wsCalc.Range("A:A").NumberFormatLocal = "@" wsCalc.Range("A1").Value = "" '課題Key RunPython ("import backlogfunc; backlogfunc.xl_getIssue();") End Sub ' バージョン情報登録 Public Sub py_setVersion() Dim wb As Workbook: Set wb = ThisWorkbook Dim wsCalc As Worksheet: Set wsCalc = wb.Worksheets("WORK") wsCalc.Cells.Clear wsCalc.Range("A:A").NumberFormatLocal = "@" wsCalc.Range("A1").Value = "EXCELから登録テスト" wsCalc.Range("A2").Value = Format(Now(), "YYYY-MM-DD") wsCalc.Range("A3").Value = Format(Now() + 30, "YYYY-MM-DD") RunPython ("import backlogfunc; backlogfunc.xl_setVersion();") End Sub ' 課題登録 Public Sub py_setIssue() Dim wb As Workbook: Set wb = ThisWorkbook Dim wsCalc As Worksheet: Set wsCalc = wb.Worksheets("WORK") '1:strSummary '2:strStartDate '3:strDueDate '4:strDescription '5:strCategoryID '6:strVersionID '7:strParentIssueKey '8:IssueID(返り値) '試作品につき、課題件名 カテゴリID 詳細はほかの課題から取得 実際は引数に Call py_getIssue Dim sSummary As String: sSummary = wsCalc.Range("A3").Value Dim sCategoryId As String: sCategoryId = wsCalc.Range("A2").Value Dim sDescription As String: sDescription = wsCalc.Range("A4").Value With wsCalc .Cells.Clear .Range("A:A").NumberFormatLocal = "@" .Range("A1").Value = sSummary & "_Excelから登録" .Range("A2").Value = Format(Date, "YYYY-MM-DD") .Range("A3").Value = Format(Date + 30, "YYYY-MM-DD") .Range("A4").Value = sDescription .Range("A5").Value = sCategoryId .Range("A6").Value = "" 'バージョンID .Range("A7").Value = "" '親課題キー(未設定の場合は親課題として登録/設定がある場合は子課題として登録) End With If wsCalc.Range("A7").Value = "" Then RunPython ("import backlogfunc; backlogfunc.xl_setIssue_Parent();") Else RunPython ("import backlogfunc; backlogfunc.xl_setIssue_Child();") End If End Sub
TripAdviserの一覧からスポット情報をスクレイピング
先に記載した関数で取得したURLから、個々のスポット情報を取得
Public Function getSpotList(sUrl As String) As Variant Dim ie As InternetExplorer: Set ie = New InternetExplorer Dim sRet As String Dim dom As HTMLDocument Dim el As IHTMLElement Dim vtmp As Variant Dim f As Boolean Dim i As Long Dim j As Long Dim nPage As Long Dim ie2 As InternetExplorer Dim sFullAddress As String Dim sZipCode As String ie.navigate sUrl nPage = 1 Call MySleep(1000) Call ieCheck(ie) Set dom = ie.Document f = True Do While f For Each el In dom.getElementsByClassName("near_listing") sRet = sRet & ConvTSV(el.getElementsByClassName("location_name")(0).innerText) sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("location_name")(0).getElementsByTagName("a")(0).getAttribute("href")) sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("address_box")(0).innerText) sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("distance")(0).getElementsByTagName("b")(0).innerText) 'ここで住所詳細も取得しよう。 'ただ、一覧だけだと7アクセスで済んだのが 個別ページまでアクセスすると207アクセス必要になるため 'やらない方がいいかも…。 Set ie2 = New InternetExplorer: ie2.navigate "https://www.tripadvisor.jp" & el.getElementsByClassName("location_name")(0).getElementsByTagName("a")(0).getAttribute("href") Call MySleep(1000) Call ieCheck(ie2) sFullAddress = "": sZipCode = "" If ie2.Document.getElementsByClassName("address").length > 0 Then sFullAddress = Trim(ie2.Document.getElementsByClassName("address")(0).innerText) If ie2.Document.getElementsByClassName("postal-code").length > 0 Then sZipCode = Trim(ie2.Document.getElementsByClassName("postal-code")(0).innerText) sFullAddress = Replace(sFullAddress, sZipCode, "") & "\t" & sZipCode sRet = sRet & vbTab & ConvTSV(sFullAddress) End If ie2.Quit 'listinfo→評価が付かなくても入っていることがある…。 If el.getElementsByClassName("listInfo").length > 0 Then sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("listInfo")(0).innerText) ' ws.Range("G" & i).Value = el.getElementsByClassName("listInfo")(0).innerText End If 'ui_bubble_rating If el.getElementsByClassName("ui_bubble_rating").length > 0 Then vtmp = Split(el.getElementsByClassName("ui_bubble_rating")(0).getAttribute("alt"), " ") For j = LBound(vtmp) To UBound(vtmp) sRet = sRet & vbTab & ConvTSV(CStr(vtmp(j))) Next j End If 'more(口コミ件数) If el.getElementsByClassName("more").length > 0 Then sRet = sRet & vbTab & ConvTSV(el.getElementsByClassName("more")(0).innerText) End If 'popranking If el.getElementsByClassName("popRanking").length > 0 Then vtmp = Split(el.getElementsByClassName("popRanking")(0).innerText, " ") For j = LBound(vtmp) To UBound(vtmp) sRet = sRet & vbTab & ConvTSV(CStr(vtmp(j))) Next j End If sRet = sRet & vbCrLf Next el '次のページ存在チェック f = False For Each el In dom.getElementsByClassName("sprite-pageNext") If el.tagName = "A" Then f = True el.Click el.FireEvent ("onclick") Call ieCheck(ie) Call MySleep(1000) nPage = nPage + 1 Exit For End If Next el Loop ie.Quit getSpotList = sRet End Function
おまけ:タブ区切りテキストで出力するための変換関数
Public Function ConvTSV(ByVal str As String) As String Dim sRet As String sRet = Replace(Trim(str), """", """""") 'ダブルクオートのエスケープ & トリム sRet = Replace(sRet, vbCr, "\r") 'CRを\rに変換 sRet = Replace(sRet, vbLf, "\n") 'LFを\nに変換 sRet = """" & sRet & """" '前後にダブルクオートをつける ConvTSV = sRet End Function
サンプルプログラム:
Public Sub test_getSpotList(ParamArray vKeyWords()) Dim fs As FileSystemObject: Set fs = New FileSystemObject Dim vtemp As Variant: vtemp = vKeyWords() Dim sFileName As String Dim i As Long For i = LBound(vKeyWords) To UBound(vKeyWords) If sFileName <> "" Then sFileName = sFileName & "_" sFileName = sFileName & CStr(vKeyWords(i)) Next i Dim ts As TextStream: Set ts = fs.CreateTextFile("C:\" & sFileName & Format(Now(), "yymmddhhnnss") & ".txt") ts.Write getSpotList(getTAURL(vtemp)) ts.Close Set fs = Nothing End Sub
TripAdviserのスポット一覧のURLを取得
GoogleでKeyWordを+でつなぐ+「site:www.tripadvisor.jp」で検索
→検索結果タイトル&本文にKeyWordsが含まれるもののリンク先URLを取得
Hitするものが存在しなかった場合は空文字を返す
KeyWords はスポット名 + 地域名(市区町村あたり) + ジャンル(観光/グルメ)あたりを想定
ieCheckは以下のものを使用 ついでに1秒ウェイトを入れるためのmySleep関数をここに書いてる
https://www.vba-ie.net/code/iecheck.html
Public Function getTAURL(vKeyWords As Variant) As String Dim ie As InternetExplorer: Set ie = New InternetExplorer Dim sKeyWords As String: sKeyWords = "" Dim i As Long Dim f As Boolean For i = LBound(vKeyWords) To UBound(vKeyWords) If sKeyWords <> "" Then sKeyWords = sKeyWords & "+" sKeyWords = sKeyWords & CStr(vKeyWords(i)) Next i ie.navigate "https://www.google.co.jp/search?num=100&q=" & sKeyWords & "+site%3Awww.tripadvisor.jp" Call MySleep(1000) Call ieCheck(ie) Dim el As IHTMLElement ' 地名+駅名とかにするとh3だけの評価ではだめなので、本文部分もチェックする For Each el In ie.Document.getElementsByClassName("srg")(0).getElementsByClassName("rc") f = True For i = LBound(vKeyWords) To UBound(vKeyWords) If InStr(1, el.innerText, CStr(vKeyWords(i))) = 0 Then f = False Exit For End If Next i If f Then Exit For Next el If f Then getTAURL = el.getElementsByTagName("h3")(0).getElementsByTagName("a")(0).getAttribute("href") Else getTAURL = "" End If ie.Quit End Function
おまけ、テスト用のデータとして、青空文庫のHTMLから見出し/ルビ/注釈を除去したテキストを取得
Public Function getAozora(ByVal sUrl As String) As String Dim ie As InternetExplorer: Set ie = New InternetExplorer: ie.navigate sUrl: Call ieCheck(ie) Dim dom As HTMLDocument: Set dom = ie.document Dim sTitle As String: Dim sAuthor As String sTitle = dom.getElementsByClassName("title")(0).innerText sAuthor = dom.getElementsByClassName("author")(0).innerText Dim el As IHTMLElement 'ルビの削除 For Each el In dom.getElementsByClassName("main_text")(0).getElementsByTagName("ruby") el.outerHTML = el.getElementsByTagName("rb")(0).innerText Next el '注釈の削除 For Each el In dom.getElementsByClassName("main_text")(0).getElementsByTagName("span") el.outerHTML = "" Next el '見出しの削除 For Each el In dom.getElementsByClassName("main_text")(0).getElementsByTagName("div") el.outerHTML = "" Next el getAozora = dom.getElementsByClassName("main_text")(0).innerText ' sTitle & vbTab & sAuthor & vbTab & ie.Quit End Function