メモ

主にプログラミング系の備忘録

おまけ：テスト用のデータとして、青空文庫のHTMLから見出し・ルビ・注釈を除去した本文テキストを取得するVBA関数（参照設定に「Microsoft Internet Controls」と「Microsoft HTML Object Library」が必要）。

' Fetches an Aozora Bunko HTML page with Internet Explorer and returns the
' inner text of the first "main_text" element after stripping markup.
'
' Processing steps (applied inside the first element of class "main_text"):
'   1. every <ruby> element is replaced by the text of its first <rb> child
'   2. every <span> element is removed (used here to drop annotations)
'   3. every <div> element is removed (used here to drop headings)
'
' Parameters:
'   sUrl - URL of the page to load.
' Returns:
'   innerText of the cleaned "main_text" element.
' Errors:
'   Any runtime error (e.g. the page has no "main_text" element) is re-raised
'   to the caller after the Internet Explorer instance has been closed.
' Depends on:
'   ieCheck - helper defined outside this block; called right after navigate
'   (presumably waits for the page to finish loading - confirm).
Public Function getAozora(ByVal sUrl As String) As String
  Dim ie As InternetExplorer
  Dim errNum As Long, errSrc As String, errDesc As String

  Set ie = New InternetExplorer
  On Error GoTo Failed

  ie.navigate sUrl
  Call ieCheck(ie)

  Dim dom As HTMLDocument: Set dom = ie.document

  ' Look the body container up once instead of once per pass.
  ' Late-bound (Object) so member resolution does not depend on which MSHTML
  ' interface exposes getElementsByTagName.
  Dim body As Object: Set body = dom.getElementsByClassName("main_text")(0)

  Dim nodes As Object
  Dim el As Object
  Dim rbNodes As Object
  Dim i As Long

  ' The collections returned by getElementsByTagName are live: replacing or
  ' removing an element changes the collection while it is being walked.
  ' Iterating by index from the end keeps the not-yet-visited indexes stable
  ' (descendants come after their ancestors, so they are handled first).

  ' Replace each ruby element with its base text.
  Set nodes = body.getElementsByTagName("ruby")
  For i = nodes.Length - 1 To 0 Step -1
    Set el = nodes.Item(i)
    Set rbNodes = el.getElementsByTagName("rb")
    ' A ruby element without an <rb> child is left unchanged instead of raising.
    If rbNodes.Length > 0 Then el.outerHTML = rbNodes.Item(0).innerText
  Next i

  ' Remove annotations.
  Set nodes = body.getElementsByTagName("span")
  For i = nodes.Length - 1 To 0 Step -1
    Set el = nodes.Item(i)
    el.outerHTML = ""
  Next i

  ' Remove headings.
  Set nodes = body.getElementsByTagName("div")
  For i = nodes.Length - 1 To 0 Step -1
    Set el = nodes.Item(i)
    el.outerHTML = ""
  Next i

  getAozora = body.innerText

CleanUp:
  ' Always close the browser, even when an error occurred above.
  On Error Resume Next
  ie.Quit
  Set ie = Nothing
  On Error GoTo 0
  If errNum <> 0 Then Err.Raise errNum, errSrc, errDesc
  Exit Function

Failed:
  ' Capture the error details before Resume clears the Err object.
  errNum = Err.Number
  errSrc = Err.Source
  errDesc = Err.Description
  Resume CleanUp
End Function