Batch web page screenshots with python3-pyppeteer

The previous version used the pychrome module for screenshots, but testing turned up all kinds of problems, which is why this post exists.

Code

pip install pyppeteer xmltodict
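Before the full batch script, here is a minimal sketch of the pyppeteer calls it is built on (launch a headless Chromium, open a page, navigate, take a screenshot). The URL and output filename are only placeholders:

import asyncio
from pyppeteer import launch

async def single_shot():
    # pyppeteer downloads its own Chromium build the first time launch() runs
    browser = await launch(headless=True)
    page = await browser.newPage()
    await page.goto('http://example.com')          # placeholder URL
    await page.screenshot({'path': 'single.png'})  # placeholder output file
    await browser.close()

asyncio.get_event_loop().run_until_complete(single_shot())

The full script below wraps these calls in a loop, taking its URLs either from a text file or from an nmap XML report, and keeps going when an individual page fails: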

import asyncio
import sys
import time
from urllib.parse import urlparse

import xmltodict
from pyppeteer import launch


# Read a URL/domain list from a plain-text file, one entry per line
def read_txt(txt_file):
    with open(txt_file, encoding='utf-8') as fp:
        return [line.strip() for line in fp if line.strip()]


# Read IP:port pairs from an nmap XML report (nmap -oX)
def nmap_xml(xml_file):
    with open(xml_file, encoding='utf-8') as fp:
        xml_url = []
        xml_obj = xmltodict.parse(fp.read())
        # Note: xmltodict returns a plain dict (not a list) when the report
        # contains only one <host> or one <port>, so this loop assumes
        # several hosts, each with a single open port.
        host = xml_obj['nmaprun']['host']
        for entry in host:
            port = str(entry['ports']['port']['@portid'])
            if port:
                addr = entry['address']['@addr']
                xml_url.append(addr + ':' + port)
        return xml_url


async def HCS():
    num = 0
    start = time.time()
    browser = await launch(headless=True)
    page = await browser.newPage()
    # Pick the URL source according to the command-line switch
    if sys.argv[1].lower() == '-txt':
        url_list = read_txt(sys.argv[2])
    elif sys.argv[1].lower() == '-xml':
        url_list = nmap_xml(sys.argv[2])
    else:
        print('Unknown option: ' + sys.argv[1])
        await browser.close()
        return
    for url in url_list:
        # Prepend a scheme when the entry is a bare host[:port];
        # port 443 is treated as HTTPS, everything else as HTTP
        if '://' not in url:
            if ':443' in url:
                url = 'https://' + url
            else:
                url = 'http://' + url
        print('[+] Fetching: ' + url)
        try:
            await page.goto(url)
            await page.waitFor(1000)  # give the page a moment to render
            domain_title = await page.evaluate('''() => {
                return {
                    title: document.querySelector('title').innerText,
                }
            }''')
            # Name the file after the host plus the first few characters of
            # the page title; the image/ directory must already exist
            url_title = (domain_title['title'][:4]).strip()
            url_domain = urlparse(url).netloc
            screenshot_path = 'image/' + url_domain + '_' + url_title + '.png'
            await page.screenshot({'path': screenshot_path})
            num += 1
            print(num)
            print(url)
            print(url_title)
        except Exception as e:
            print(e)
    end = time.time()
    print('Screenshots taken: ' + str(num))
    print('Time elapsed: ' + str(int(end - start)) + ' seconds')
    await browser.close()


if __name__ == '__main__':
    info = '''
info:
    Use python3 Headless_Chrome_Screenshot.py -txt domain.txt      --> text file containing the URLs
    Use python3 Headless_Chrome_Screenshot.py -xml domain_nmap.xml --> nmap scan results in XML format
    '''
    if len(sys.argv) != 3:
        print(info)
    else:
        asyncio.get_event_loop().run_until_complete(HCS())
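A typical run looks like this (the target range, port list and file names are just examples; the image/ output directory has to exist because the script does not create it, and the first launch takes longer while pyppeteer downloads its Chromium build):

mkdir image
nmap -p 80,443,8080 -oX domain_nmap.xml 192.168.1.0/24
python3 Headless_Chrome_Screenshot.py -xml domain_nmap.xml
python3 Headless_Chrome_Screenshot.py -txt domain.txt

One caveat worth knowing: when the nmap report contains only a single host (or a single open port per host), xmltodict returns a dict instead of a list and the loop in nmap_xml fails. A small workaround, if you hit that case, is to normalise the value before iterating:

host = xml_obj['nmaprun']['host']
if isinstance(host, dict):
    host = [host]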

Results

[screenshot of the results]

Author: 1314mylove
Link: https://blog.1314mylove.com/2018/04/python3-pyppeteer批量网页截图/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless otherwise stated.