IT Convergence Engineering/AI 버섯 어플
[Web crawling] Image web crawling (python)
Soo_buglosschestnut
2020. 7. 30. 21:41
Image web crawling (python)
import os
import time
from urllib.parse import quote_plus
from urllib.request import urlopen
from urllib.request import urlretrieve

from bs4 import BeautifulSoup
from selenium import webdriver
def createFolder(directory):
    """Create *directory* (including parents) if it does not already exist.

    Prints a confirmation when the directory is created, and an error
    message when creation fails (e.g. permission denied).
    """
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
            print('created ' + directory)
    except OSError:
        # An OSError here means makedirs failed, not that the directory
        # already exists (the exists-check above handles that case).
        print('Error: failed to create ' + directory)
# Prompt for a search term, then scrape Google Images for it and save
# every thumbnail found into data/<search>/.
search = input('검색 ')
createFolder('data/' + search)

# Google image-search URL for the query (tbm=isch selects the image tab).
url = f'https://www.google.co.kr/search?q={quote_plus(search)}&tbm=isch'

# Build the Chrome options BEFORE creating the driver: in the original
# the options object was constructed after webdriver.Chrome() and was
# therefore never applied.
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# Raw string so the Windows backslashes are taken literally.
driver = webdriver.Chrome(r"C:\project\crawling\chromedriver.exe", options=options)
driver.get(url)  # single page load (the original loaded the same URL twice)

# Scroll repeatedly so Google lazily loads more thumbnails; the short
# pause gives the page time to fetch each new batch.
for _ in range(500):
    driver.execute_script("window.scrollBy(0,50000)")
    time.sleep(0.1)

html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")

# Collect every <img> URL; thumbnails carry either "src" or "data-src".
img = soup.select('img')
imgurl = []
for i in img:
    try:
        imgurl.append(i.attrs["src"])
    except KeyError:
        imgurl.append(i.attrs["data-src"])

# Download each image as data/<search>/<search><n>.jpg.
n = 1
for i in imgurl:
    urlretrieve(i, "data/" + search + '/' + search + str(n) + ".jpg")
    # Report the index of the file just saved (the original printed n+1).
    print('downloading.........{}'.format(n))
    n += 1

driver.close()
driver = webdriver.Chrome("C:\project\crawling\chromedriver.exe")
- chrome webdriver를 download한 후에 코드에 적힌 경로(C:\project\crawling\chromedriver.exe)에 .exe파일을 넣어주거나, .exe파일을 둔 위치에 맞게 코드의 경로를 수정해준다.
driver.execute_script("window.scrollBy(0,50000)")
- 50000 → 원하는 숫자로 바꾸기 / 한 번에 스크롤할 거리(픽셀)를 정하는 코드이며, 스크롤 횟수는 range(500)의 숫자로 정한다.
- 참조
출처
https://blog.naver.com/khw11044/222014819050