IT Convergence Engineering/AI 버섯 어플
[Web crawling] Image web crawling (python)
Soo_buglosschestnut
2020. 7. 30. 21:41
Image web crawling (python)
import os
import time
from urllib.parse import quote_plus
from urllib.request import urlopen
from urllib.request import urlretrieve

from bs4 import BeautifulSoup
from selenium import webdriver
def createFolder(directory):
    """Create *directory* (including parents) if it does not already exist.

    Prints a confirmation when the directory is created, and an error
    message when creation fails (e.g. permission denied).
    """
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
            print('created ' + directory)
    except OSError:
        # An OSError here means makedirs failed, not that the directory
        # already exists (the exists-check above handles that case).
        print('Error: failed to create ' + directory)
# Prompt for a search term, then scrape Google Images for it and save
# every thumbnail found into data/<search>/.
search = input('검색 ')
createFolder('data/' + search)

# Google image-search URL for the query (tbm=isch selects the image tab).
url = f'https://www.google.co.kr/search?q={quote_plus(search)}&tbm=isch'

# Build the Chrome options BEFORE creating the driver: in the original
# the options object was constructed after webdriver.Chrome() and was
# therefore never applied.
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# Raw string so the Windows backslashes are taken literally.
driver = webdriver.Chrome(r"C:\project\crawling\chromedriver.exe", options=options)
driver.get(url)  # single page load (the original loaded the same URL twice)

# Scroll repeatedly so Google lazily loads more thumbnails; the short
# pause gives the page time to fetch each new batch.
for _ in range(500):
    driver.execute_script("window.scrollBy(0,50000)")
    time.sleep(0.1)

html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")

# Collect every <img> URL; thumbnails carry either "src" or "data-src".
img = soup.select('img')
imgurl = []
for i in img:
    try:
        imgurl.append(i.attrs["src"])
    except KeyError:
        imgurl.append(i.attrs["data-src"])

# Download each image as data/<search>/<search><n>.jpg.
n = 1
for i in imgurl:
    urlretrieve(i, "data/" + search + '/' + search + str(n) + ".jpg")
    # Report the index of the file just saved (the original printed n+1).
    print('downloading.........{}'.format(n))
    n += 1

driver.close()
driver = webdriver.Chrome("C:\project\crawling\chromedriver.exe")
- chrome webdriver를 download한 후에 코드에 적힌 경로(C:\project\crawling\chromedriver.exe)에 .exe파일을 넣어주거나, .exe파일을 둔 위치에 맞게 코드의 경로를 수정해준다.
driver.execute_script("window.scrollBy(0,50000)")
- 50000 → 원하는 숫자로 바꾸기 / 한 번에 스크롤할 거리(픽셀)를 정하는 코드이며, 스크롤 횟수는 range(500)의 숫자로 정한다.
- 참조
출처
https://blog.naver.com/khw11044/222014819050