Scrape value from table of webpage
I want to retrieve the match data from the following website:
https://understat.com/match/81
I wrote the following script:
import sys
import time
import os
import io
import csv
from selenium import webdriver
import selenium.webdriver.support.expected_conditions as ec
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
driver = None
cnx = None
currentDir = sys.path[0]
def scrap_understat():
init_browser('firefox')
for i in range(80, 10080):
try:
driver.get('https://understat.com/match/' + str(i))
time.sleep(1)
if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
continue
data = get_match_data()
save_data(data)
except Exception as ex:
log_this(ex)
print(str(ex))
close_browser()
def get_match_data():
data =
teams = driver.find_elements(By.NAME, 'team')
for team in teams:
team.find_element(By.XPATH, 'following-sibling::*').click()
time.sleep(1)
players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
for player in players:
tds = player.find_elements(By.TAG_NAME, 'td')
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]
data.append(record)
return data
def save_data(data):
# save CSV
csv_file_path = currentDir + '/output' + current_time + '.csv'
file = None
writer = None
if not os.path.exists(csv_file_path):
file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')
writer = csv.writer(file)
writer.writerow(
['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',
'xA'])
else:
file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')
writer = csv.writer(file)
for record in data:
writer.writerow(record)
file.close()
The output of my script looks like this:
So, there is a problem with the xG- and xA-columns. I only want the lowerscript-part whereas the script takes all the text within the td How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is called sub-class
Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
python selenium web-scraping
add a comment |
I want to retrieve the match data from the following website:
https://understat.com/match/81
I wrote the following script:
import sys
import time
import os
import io
import csv
from selenium import webdriver
import selenium.webdriver.support.expected_conditions as ec
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
driver = None
cnx = None
currentDir = sys.path[0]
def scrap_understat():
init_browser('firefox')
for i in range(80, 10080):
try:
driver.get('https://understat.com/match/' + str(i))
time.sleep(1)
if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
continue
data = get_match_data()
save_data(data)
except Exception as ex:
log_this(ex)
print(str(ex))
close_browser()
def get_match_data():
data =
teams = driver.find_elements(By.NAME, 'team')
for team in teams:
team.find_element(By.XPATH, 'following-sibling::*').click()
time.sleep(1)
players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
for player in players:
tds = player.find_elements(By.TAG_NAME, 'td')
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]
data.append(record)
return data
def save_data(data):
# save CSV
csv_file_path = currentDir + '/output' + current_time + '.csv'
file = None
writer = None
if not os.path.exists(csv_file_path):
file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')
writer = csv.writer(file)
writer.writerow(
['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',
'xA'])
else:
file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')
writer = csv.writer(file)
for record in data:
writer.writerow(record)
file.close()
The output of my script looks like this:
So, there is a problem with the xG- and xA-columns. I only want the lowerscript-part whereas the script takes all the text within the td How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is called sub-class
Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
python selenium web-scraping
add a comment |
I want to retrieve the match data from the following website:
https://understat.com/match/81
I wrote the following script:
import sys
import time
import os
import io
import csv
from selenium import webdriver
import selenium.webdriver.support.expected_conditions as ec
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
driver = None
cnx = None
currentDir = sys.path[0]
def scrap_understat():
init_browser('firefox')
for i in range(80, 10080):
try:
driver.get('https://understat.com/match/' + str(i))
time.sleep(1)
if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
continue
data = get_match_data()
save_data(data)
except Exception as ex:
log_this(ex)
print(str(ex))
close_browser()
def get_match_data():
data =
teams = driver.find_elements(By.NAME, 'team')
for team in teams:
team.find_element(By.XPATH, 'following-sibling::*').click()
time.sleep(1)
players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
for player in players:
tds = player.find_elements(By.TAG_NAME, 'td')
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]
data.append(record)
return data
def save_data(data):
# save CSV
csv_file_path = currentDir + '/output' + current_time + '.csv'
file = None
writer = None
if not os.path.exists(csv_file_path):
file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')
writer = csv.writer(file)
writer.writerow(
['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',
'xA'])
else:
file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')
writer = csv.writer(file)
for record in data:
writer.writerow(record)
file.close()
The output of my script looks like this:
So, there is a problem with the xG- and xA-columns. I only want the lowerscript-part whereas the script takes all the text within the td How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is called sub-class
Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
python selenium web-scraping
I want to retrieve the match data from the following website:
https://understat.com/match/81
I wrote the following script:
import sys
import time
import os
import io
import csv
from selenium import webdriver
import selenium.webdriver.support.expected_conditions as ec
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
driver = None
cnx = None
currentDir = sys.path[0]
def scrap_understat():
init_browser('firefox')
for i in range(80, 10080):
try:
driver.get('https://understat.com/match/' + str(i))
time.sleep(1)
if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
continue
data = get_match_data()
save_data(data)
except Exception as ex:
log_this(ex)
print(str(ex))
close_browser()
def get_match_data():
data =
teams = driver.find_elements(By.NAME, 'team')
for team in teams:
team.find_element(By.XPATH, 'following-sibling::*').click()
time.sleep(1)
players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
for player in players:
tds = player.find_elements(By.TAG_NAME, 'td')
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]
data.append(record)
return data
def save_data(data):
# save CSV
csv_file_path = currentDir + '/output' + current_time + '.csv'
file = None
writer = None
if not os.path.exists(csv_file_path):
file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')
writer = csv.writer(file)
writer.writerow(
['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',
'xA'])
else:
file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')
writer = csv.writer(file)
for record in data:
writer.writerow(record)
file.close()
The output of my script looks like this:
So, there is a problem with the xG- and xA-columns. I only want the lowerscript-part whereas the script takes all the text within the td How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is called sub-class
Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
python selenium web-scraping
python selenium web-scraping
asked Nov 23 '18 at 8:27
HJA24HJA24
13714
13714
add a comment |
add a comment |
2 Answers
2
active
oldest
votes
Try this one to avoid matching sub text:
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text,
driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]
To get teamnames you can use
home = driver.find_element_by_xpath('//label[@for="team-home"]').text
away = driver.find_element_by_xpath('//label[@for="team-away"]').text
P.S. Consider to use Waits instead of time.sleep
Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this
– HJA24
Nov 23 '18 at 9:38
@HJA24 , tryfor team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text
. You can then useteam_name
in players loop - this value will be the same for all team players
– Andersson
Nov 23 '18 at 9:48
add a comment |
It looks like you just need to remove the sup
's:
driver.execute_script("$('sup').remove()")
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53443028%2fscrape-value-from-table-of-webpage%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
Try this one to avoid matching sub text:
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text,
driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]
To get teamnames you can use
home = driver.find_element_by_xpath('//label[@for="team-home"]').text
away = driver.find_element_by_xpath('//label[@for="team-away"]').text
P.S. Consider to use Waits instead of time.sleep
Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this
– HJA24
Nov 23 '18 at 9:38
@HJA24 , tryfor team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text
. You can then useteam_name
in players loop - this value will be the same for all team players
– Andersson
Nov 23 '18 at 9:48
add a comment |
Try this one to avoid matching sub text:
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text,
driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]
To get teamnames you can use
home = driver.find_element_by_xpath('//label[@for="team-home"]').text
away = driver.find_element_by_xpath('//label[@for="team-away"]').text
P.S. Consider to use Waits instead of time.sleep
Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this
– HJA24
Nov 23 '18 at 9:38
@HJA24 , tryfor team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text
. You can then useteam_name
in players loop - this value will be the same for all team players
– Andersson
Nov 23 '18 at 9:48
add a comment |
Try this one to avoid matching sub text:
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text,
driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]
To get teamnames you can use
home = driver.find_element_by_xpath('//label[@for="team-home"]').text
away = driver.find_element_by_xpath('//label[@for="team-away"]').text
P.S. Consider to use Waits instead of time.sleep
Try this one to avoid matching sub text:
record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
tds[5].text, tds[6].text, tds[7].text,
driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]
To get teamnames you can use
home = driver.find_element_by_xpath('//label[@for="team-home"]').text
away = driver.find_element_by_xpath('//label[@for="team-away"]').text
P.S. Consider to use Waits instead of time.sleep
edited Nov 23 '18 at 8:47
answered Nov 23 '18 at 8:38
AnderssonAndersson
39.1k113669
39.1k113669
Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this
– HJA24
Nov 23 '18 at 9:38
@HJA24 , tryfor team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text
. You can then useteam_name
in players loop - this value will be the same for all team players
– Andersson
Nov 23 '18 at 9:48
add a comment |
Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this
– HJA24
Nov 23 '18 at 9:38
@HJA24 , tryfor team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text
. You can then useteam_name
in players loop - this value will be the same for all team players
– Andersson
Nov 23 '18 at 9:48
Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this
– HJA24
Nov 23 '18 at 9:38
Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this
– HJA24
Nov 23 '18 at 9:38
@HJA24 , try
for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text
. You can then use team_name
in players loop - this value will be the same for all team players– Andersson
Nov 23 '18 at 9:48
@HJA24 , try
for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text
. You can then use team_name
in players loop - this value will be the same for all team players– Andersson
Nov 23 '18 at 9:48
add a comment |
It looks like you just need to remove the sup
's:
driver.execute_script("$('sup').remove()")
add a comment |
It looks like you just need to remove the sup
's:
driver.execute_script("$('sup').remove()")
add a comment |
It looks like you just need to remove the sup
's:
driver.execute_script("$('sup').remove()")
It looks like you just need to remove the sup
's:
driver.execute_script("$('sup').remove()")
answered Nov 23 '18 at 9:22
pguardiariopguardiario
36.8k980117
36.8k980117
add a comment |
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53443028%2fscrape-value-from-table-of-webpage%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown