SELENIUM

概念：基于浏览器自动化的模块
自动化：可以通过代码指定一系列的行为动作，然后将其作用到浏览器中。
selenium和爬虫的关联
- 便捷的捕获到任意形式动态加载的数据(可见即可得)
- 实现模拟登录

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21


from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep


# Demo script: open JD.com, search for a keyword, then scroll the results
# page to the bottom via injected JavaScript.

# 1. Instantiate a browser object from the browser's driver program.
# NOTE(review): `executable_path` is the Selenium 3 style of passing the
# driver location; recent Selenium 4 releases expect a `Service` object
# instead -- confirm against the installed Selenium version.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    # Send a request to the target site.
    bro.get('https://www.jd.com/')
    # Locate the search input. `find_element(By.XPATH, ...)` is used instead
    # of the removed `find_element_by_xpath` helper.
    search_text = bro.find_element(By.XPATH, '//*[@id="key"]')
    # Type the search keyword into the input.
    search_text.send_keys('iPhone11')

    # Locate the search button and click it.
    btn = bro.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
    btn.click()

    # Crude fixed wait before scrolling.
    sleep(5)
    # Scroll the results page down to the bottom (JS injection).
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(5)
finally:
    # Always close the browser, even if one of the steps above raised.
    bro.quit()

爬取数据

古诗词网为例：https://www.gushiwen.org/

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35


from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree

# Demo script: load gushiwen.org, click the "more" link twice, collect the
# page source after each step, then print the titles and the poem bodies
# found in every captured page.

url = 'https://www.gushiwen.org/'
# NOTE(review): `executable_path` is the Selenium 3 style of passing the
# driver location -- confirm against the installed Selenium version.
bro = webdriver.Edge(executable_path='./msedgedriver.exe')
try:
    bro.get(url)
    page_text_list = []
    sleep(1)
    # Capture the source of the page currently shown in the browser.
    page_text = bro.page_source
    page_text_list.append(page_text)
    # Click the "next page" element twice, capturing the source each time.
    for _ in range(2):
        next_page = bro.find_element(By.XPATH, '//*[@id="amore"]')
        next_page.click()
        sleep(1)
        page_text_list.append(bro.page_source)

    # Parse every captured page once and reuse the trees for both passes.
    trees = [etree.HTML(source) for source in page_text_list]

    # First pass: print the titles of each page.
    for tree in trees:
        for li in tree.xpath('/html/body/div[2]/div[1]'):
            # The leading '.' keeps the query relative to `li`; a bare '//'
            # would search the whole document again.
            title = li.xpath('.//p[1]/a/b/text()')
            print(title)

    # Second pass: print the poem bodies of each page.
    for tree in trees:
        for n in tree.xpath('/html/body/div[2]/div[1]'):
            context = n.xpath('.//div[@class="contson"]/text()')
            print(context)

    sleep(2)
finally:
    # Always close the browser, even if locating or parsing raised.
    bro.quit()

selenium的弊端：
- 效率低

动作链Action Chains

动作链：一系列连续的动作(滑动动作)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


from time import sleep
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

# Demo script: drag the "draggable" box on the runoob jQuery-UI demo page
# using an action chain (press, move in small steps, release).

url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
# NOTE(review): `executable_path` is the Selenium 3 style of passing the
# driver location -- confirm against the installed Selenium version.
bro = webdriver.Edge(executable_path='./msedgedriver.exe')
try:
    bro.get(url)

    sleep(1)

    # Elements that live inside an iframe cannot be located with the find_*
    # functions from the top-level document.
    # Fix: switch into the iframe first via switch_to.
    bro.switch_to.frame('iframeResult')
    div_tag = bro.find_element(By.XPATH, '//*[@id="draggable"]')

    action = ActionChains(bro)
    # Queue "press and hold the mouse button on the box"; it is sent to the
    # browser by the first perform() call below.
    action.click_and_hold(div_tag)

    for _ in range(6):
        # perform() executes the queued actions immediately.
        action.move_by_offset(10, 15).perform()
        sleep(0.5)
    # release() only queues the action; without perform() the mouse button
    # would never actually be released.
    action.release().perform()
finally:
    # Always close the browser, even if one of the steps above raised.
    bro.quit()

如何让selenium规避检测

浏览器接管

找到电脑中安装的谷歌浏览器的驱动程序所在的目录，并将该目录添加到环境变量中。
打开cmd，输入：
- chrome.exe --remote-debugging-port=9222 --user-data-dir="一个空文件夹的目录"

执行如下代码：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Demo script: attach Selenium to a Chrome instance that is already running
# with remote debugging enabled on port 9222, then print its page title.

chrome_options = Options()
# Connect to the already-running browser instead of launching a new one.
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
# Path of the locally installed Chrome driver program. A raw string keeps the
# backslashes literal; the directory name was misspelled "Gooole".
chrome_driver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"

# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): `executable_path` is the Selenium 3 style of passing the
# driver location -- confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)
print(driver.title)

指令执行结束后，会打开本机安装好的谷歌浏览器

无头浏览器(无可视化界面浏览器)

Google无头浏览器

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

# Demo script: open Baidu in headless Chrome and save a screenshot.

# Build an options object that makes Chrome start without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Driver path.
path = r'C:\Users\ZBLi\Desktop\1801\day05\ziliao\chromedriver.exe'
# Create the browser object. `options=` replaces the deprecated
# `chrome_options=` keyword.
# NOTE(review): `executable_path` is the Selenium 3 style of passing the
# driver location -- confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path=path, options=chrome_options)
try:
    # Visit the page.
    url = 'http://www.baidu.com/'
    browser.get(url)
    time.sleep(3)
    # Save a screenshot of the page to the working directory.
    browser.save_screenshot('baidu.png')
finally:
    # Always shut the headless browser down, even if loading the page or
    # taking the screenshot raised.
    browser.quit()