Python の lxml で HTML の DOM を操作してみる。
ソースコード
import sys
import lxml.html


def _show(node):
    """Print ``node`` serialized by lxml.html.tostring, decoded as UTF-8."""
    print(lxml.html.tostring(node).decode('utf-8'))


# Demo of basic DOM manipulation with lxml.html: remove, locate, replace,
# walk up to the parent, and append child elements.

# The starting document: one <div class="a"> holding paragraphs p0..p4.
markup = '<html><body><div class="a"><p>p0</p><p>p1</p><p>p2</p><p>p3</p><p>p4</p></div></body></html>'
root = lxml.html.fromstring(markup)
container = root.xpath('//div[@class="a"]')[0]

# Remove p0 (the first child of the div).
first_child = container[0]
container.remove(first_child)
_show(root)

# Look up the position of p2; after the removal it is the second child.
target = container[1]
position = container.index(target)
print('idx: %d' % (position))

# Replace p2 with a freshly parsed paragraph.
replacement = lxml.html.fromstring('<p>p2 new</p>')
container[position] = replacement
_show(root)

# Print the div's parent element (the <body>).
parent = container.getparent()
_show(parent)

# Append a new paragraph, p5, to the end of the div.
extra = lxml.html.fromstring('<p>p5</p>')
container.append(extra)
_show(root)
実行結果
<html><body><div class="a"><p>p1</p><p>p2</p><p>p3</p><p>p4</p></div></body></html> idx: 1 <html><body><div class="a"><p>p1</p><p>p2 new</p><p>p3</p><p>p4</p></div></body></html> <body><div class="a"><p>p1</p><p>p2 new</p><p>p3</p><p>p4</p></div></body> <html><body><div class="a"><p>p1</p><p>p2 new</p><p>p3</p><p>p4</p><p>p5</p></div></body></html>