scraping.py
from selenium import webdriver  # Automates a real browser; needed because the exhibitor pages are rendered dynamically.
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

driver_path = 'C:/Users/mahin/Downloads/chromedriver_win32/chromedriver.exe'
service = Service(driver_path)
driver = webdriver.Chrome(service=service)  # Start a Chrome session through the WebDriver service.

base_url = 'https://exhibitors.electronica.de/prj_807/view/?nv=2&lng=2'  # Exhibitor listing page to scrape.
driver.get(base_url)
# Wait until the company links (class "urlShare") are present, i.e. the listing has loaded.
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'urlShare')))

page_number = 1
total_pages = 2  # Adjust the total page count as needed; it is currently limited to two pages.
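# Note: the hard-coded chromedriver path above is machine-specific. With Selenium 4.6+,
# Selenium Manager resolves a matching driver automatically, so the Service/path could be
# dropped entirely (a minimal alternative, assuming Selenium >= 4.6 is installed):
# driver = webdriver.Chrome()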
while page_number <= total_pages:
    print(f"Scraping page {page_number}...")
    page_source = driver.page_source  # HTML source of the current listing page.
    soup = BeautifulSoup(page_source, 'html.parser')  # Parse the HTML for easy element lookup.
    company_names = soup.find_all('a', class_='urlShare')
    for company in company_names:
        company_name = company.get_text(strip=True)  # Company name with surrounding whitespace stripped.
        print(f"Extracting details for: {company_name}")
        company_link = company.get('href')  # Link to the company's detail page.
        driver.get(company_link)  # Open the detail page in the Selenium-controlled browser.
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'ce_addr')))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
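        # The fixed sleep above is fragile; an explicit wait on the content block would be
        # more robust (a sketch, assuming the profile text sits in ".ce_text" as used below):
        # WebDriverWait(driver, 10).until(
        #     EC.presence_of_element_located((By.CSS_SELECTOR, '.ce_text')))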
        industry_category_details_page = 'N/A'
        try:
            # Look for the industry category on the company details page.
            industry_category_details_element = driver.find_element(By.CSS_SELECTOR, '.ce_text')
            if industry_category_details_element:
                industry_category_details_page = industry_category_details_element.text.strip()
        except Exception as e:
            print(f"Industry category not found on details page for {company_name}. Error: {e}")
        # All five social links share the selector pattern ".ce_smch.ce_<Platform> a",
        # so one loop replaces five near-identical try/except blocks. Links that are
        # missing keep the 'N/A' placeholder.
        social_media = {platform: 'N/A' for platform in
                        ('LinkedIn', 'YouTube', 'Instagram', 'Facebook', 'Twitter')}
        for platform in social_media:
            try:
                link_element = driver.find_element(By.CSS_SELECTOR, f'.ce_smch.ce_{platform} a')
                social_media[platform] = link_element.get_attribute('href').strip()
            except Exception as e:
                print(f"{platform} not found for {company_name}. Error: {e}")
        # Extract the address.
        try:
            address_element = driver.find_element(By.CSS_SELECTOR, '.ce_addr')
            address = address_element.text.strip()
            print(f"Address for {company_name}: {address}")
        except Exception as e:
            address = 'N/A'
            print(f"Address not found for {company_name}. Error: {e}")
        # Extract the email from the mailto: link.
        try:
            email_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.ce_email a'))
            )
            email = email_element.get_attribute('href').replace('mailto:', '').strip()
        except Exception as e:
            email = 'N/A'
            print(f"Email not found for {company_name}. Error: {e}")
        # Extract the phone number from the tel: link.
        try:
            phone_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.ce_phone a'))
            )
            phone = phone_element.get_attribute('href').replace('tel:', '').strip()
        except Exception as e:
            phone = 'N/A'
            print(f"Phone not found for {company_name}. Error: {e}")
        # Extract the website URL.
        try:
            website_element = driver.find_element(By.CSS_SELECTOR, '.ce_website a')
            website = website_element.get_attribute('href').strip()
        except Exception as e:
            website = 'N/A'
            print(f"Website not found for {company_name}. Error: {e}")
        # Extract the products and services.
        products_services = []
        try:
            products_section = driver.find_elements(By.CSS_SELECTOR, '.ce_head h2 a')
            for product in products_section:
                products_services.append(product.text.strip())
        except Exception as e:
            print(f"Products/Services not found for {company_name}. Error: {e}")
        # Extract the company profile text.
        company_profile = 'N/A'
        try:
            profile_element = driver.find_element(
                By.XPATH, "//div[contains(@class, 'ce_cntnt')]//div[contains(@class, 'ce_text')]")
            company_profile = profile_element.text.strip()
        except Exception as e:
            print(f"Company profile not found for {company_name}. Error: {e}")
print(f"Details for {company_name}:")
print(f" Industry Category (Details page): {industry_category_details_page}")
print(f" Address: {address}")
print(f" Email: {email}")
print(f" Phone: {phone}")
print(f" Website: {website}")
print(f" Social Media Links: {social_media}")
print(f" Products/Services: {products_services}")
print(f"Company Profile for {company_name}: {company_profile}")
time.sleep(2)
driver.back() #Navigates back to the previous page (list of companies).
    # Move on to the next listing page, if any.
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'input[name="SRField_next"][value="next"]')
        next_button.click()
        time.sleep(5)
        page_number += 1
    except Exception:
        print("No more pages or unable to navigate to the next page.")
        break

driver.quit()  # Close the browser session.
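# Possible extension (not part of the original script): instead of only printing, each
# company's fields could be collected into a list of dicts (hypothetical name `records`)
# inside the loop and written out with the standard library at the end, e.g.:
# import csv
# fieldnames = ['name', 'industry', 'address', 'email', 'phone', 'website']
# with open('exhibitors.csv', 'w', newline='', encoding='utf-8') as f:
#     writer = csv.DictWriter(f, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerows(records)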