Scrape value from table of webpage

I want to retrieve the match data from the following website:

https://understat.com/match/81

I wrote the following script:

import sys

import time 

import os

import io

import csv



from selenium import webdriver

import selenium.webdriver.support.expected_conditions as ec

import selenium.webdriver.support.ui as ui

from selenium.common.exceptions import TimeoutException

from selenium.webdriver.common.by import By



driver = None

cnx = None

currentDir = sys.path[0]



def scrap_understat():

   init_browser('firefox')



   for i in range(80, 10080):

      try:

         driver.get('https://understat.com/match/' + str(i))

         time.sleep(1)

         if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:

            continue



         data = get_match_data()

         save_data(data)



      except Exception as ex:

         log_this(ex)

         print(str(ex))



   close_browser()





def get_match_data():

   data = 



   teams = driver.find_elements(By.NAME, 'team')



   for team in teams:

       team.find_element(By.XPATH, 'following-sibling::*').click()

       time.sleep(1)

       players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')

       for player in players:

           tds = player.find_elements(By.TAG_NAME, 'td')

           record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]

           data.append(record)



   return data





def save_data(data):

# save CSV

  csv_file_path = currentDir + '/output' + current_time + '.csv'

  file = None

  writer = None

  if not os.path.exists(csv_file_path):

     file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')

     writer = csv.writer(file)

     writer.writerow(

        ['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',

         'xA'])

  else:

      file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')

      writer = csv.writer(file)

  for record in data:

      writer.writerow(record)

      file.close()

The output of my script looks like this:

enter image description here

So, there is a problem with the xG- and xA-columns. I only want the lowerscript-part whereas the script takes all the text within the td How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is called sub-class

Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
enter image description here

asked Nov 23 '18 at 8:27

HJA24

13714

add a comment |

I want to retrieve the match data from the following website:

https://understat.com/match/81

I wrote the following script:

import sys

import time 

import os

import io

import csv



from selenium import webdriver

import selenium.webdriver.support.expected_conditions as ec

import selenium.webdriver.support.ui as ui

from selenium.common.exceptions import TimeoutException

from selenium.webdriver.common.by import By



driver = None

cnx = None

currentDir = sys.path[0]



def scrap_understat():

   init_browser('firefox')



   for i in range(80, 10080):

      try:

         driver.get('https://understat.com/match/' + str(i))

         time.sleep(1)

         if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:

            continue



         data = get_match_data()

         save_data(data)



      except Exception as ex:

         log_this(ex)

         print(str(ex))



   close_browser()





def get_match_data():

   data = 



   teams = driver.find_elements(By.NAME, 'team')



   for team in teams:

       team.find_element(By.XPATH, 'following-sibling::*').click()

       time.sleep(1)

       players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')

       for player in players:

           tds = player.find_elements(By.TAG_NAME, 'td')

           record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]

           data.append(record)



   return data





def save_data(data):

# save CSV

  csv_file_path = currentDir + '/output' + current_time + '.csv'

  file = None

  writer = None

  if not os.path.exists(csv_file_path):

     file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')

     writer = csv.writer(file)

     writer.writerow(

        ['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',

         'xA'])

  else:

      file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')

      writer = csv.writer(file)

  for record in data:

      writer.writerow(record)

      file.close()

The output of my script looks like this:

enter image description here

Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
enter image description here

asked Nov 23 '18 at 8:27

HJA24

13714

add a comment |

I want to retrieve the match data from the following website:

https://understat.com/match/81

I wrote the following script:

import sys

import time 

import os

import io

import csv



from selenium import webdriver

import selenium.webdriver.support.expected_conditions as ec

import selenium.webdriver.support.ui as ui

from selenium.common.exceptions import TimeoutException

from selenium.webdriver.common.by import By



driver = None

cnx = None

currentDir = sys.path[0]



def scrap_understat():

   init_browser('firefox')



   for i in range(80, 10080):

      try:

         driver.get('https://understat.com/match/' + str(i))

         time.sleep(1)

         if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:

            continue



         data = get_match_data()

         save_data(data)



      except Exception as ex:

         log_this(ex)

         print(str(ex))



   close_browser()





def get_match_data():

   data = 



   teams = driver.find_elements(By.NAME, 'team')



   for team in teams:

       team.find_element(By.XPATH, 'following-sibling::*').click()

       time.sleep(1)

       players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')

       for player in players:

           tds = player.find_elements(By.TAG_NAME, 'td')

           record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]

           data.append(record)



   return data





def save_data(data):

# save CSV

  csv_file_path = currentDir + '/output' + current_time + '.csv'

  file = None

  writer = None

  if not os.path.exists(csv_file_path):

     file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')

     writer = csv.writer(file)

     writer.writerow(

        ['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',

         'xA'])

  else:

      file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')

      writer = csv.writer(file)

  for record in data:

      writer.writerow(record)

      file.close()

The output of my script looks like this:

enter image description here

Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
enter image description here

asked Nov 23 '18 at 8:27

HJA24

13714

I want to retrieve the match data from the following website:

https://understat.com/match/81

I wrote the following script:

import sys

import time 

import os

import io

import csv



from selenium import webdriver

import selenium.webdriver.support.expected_conditions as ec

import selenium.webdriver.support.ui as ui

from selenium.common.exceptions import TimeoutException

from selenium.webdriver.common.by import By



driver = None

cnx = None

currentDir = sys.path[0]



def scrap_understat():

   init_browser('firefox')



   for i in range(80, 10080):

      try:

         driver.get('https://understat.com/match/' + str(i))

         time.sleep(1)

         if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:

            continue



         data = get_match_data()

         save_data(data)



      except Exception as ex:

         log_this(ex)

         print(str(ex))



   close_browser()





def get_match_data():

   data = 



   teams = driver.find_elements(By.NAME, 'team')



   for team in teams:

       team.find_element(By.XPATH, 'following-sibling::*').click()

       time.sleep(1)

       players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')

       for player in players:

           tds = player.find_elements(By.TAG_NAME, 'td')

           record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]

           data.append(record)



   return data





def save_data(data):

# save CSV

  csv_file_path = currentDir + '/output' + current_time + '.csv'

  file = None

  writer = None

  if not os.path.exists(csv_file_path):

     file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')

     writer = csv.writer(file)

     writer.writerow(

        ['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',

         'xA'])

  else:

      file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')

      writer = csv.writer(file)

  for record in data:

      writer.writerow(record)

      file.close()

The output of my script looks like this:

enter image description here

Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
enter image description here

python selenium web-scraping

asked Nov 23 '18 at 8:27

HJA24

13714

asked Nov 23 '18 at 8:27

HJA24

13714

asked Nov 23 '18 at 8:27

HJA24

13714

asked Nov 23 '18 at 8:27

HJA24

13714

asked Nov 23 '18 at 8:27

HJA24

13714

add a comment |

2 Answers
2

active

oldest

votes

Try this one to avoid matching sub text:

record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text,

          driver.execute_script('return arguments[0].firstChild.textContent', tds[8]), 

          driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]

To get teamnames you can use

home = driver.find_element_by_xpath('//label[@for="team-home"]').text

away = driver.find_element_by_xpath('//label[@for="team-away"]').text

P.S. Consider to use Waits instead of time.sleep

edited Nov 23 '18 at 8:47

answered Nov 23 '18 at 8:38

Andersson

39.1k113669

Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

– HJA24
Nov 23 '18 at 9:38

@HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

– Andersson
Nov 23 '18 at 9:48

add a comment |

It looks like you just need to remove the sup's:

driver.execute_script("$('sup').remove()")

answered Nov 23 '18 at 9:22

pguardiario

36.8k980117

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53443028%2fscrape-value-from-table-of-webpage%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

2 Answers
2

active

oldest

votes

2 Answers
2

active

oldest

votes

Try this one to avoid matching sub text:

record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text,

          driver.execute_script('return arguments[0].firstChild.textContent', tds[8]), 

          driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]

To get teamnames you can use

home = driver.find_element_by_xpath('//label[@for="team-home"]').text

away = driver.find_element_by_xpath('//label[@for="team-away"]').text

P.S. Consider to use Waits instead of time.sleep

edited Nov 23 '18 at 8:47

answered Nov 23 '18 at 8:38

Andersson

39.1k113669

Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

– HJA24
Nov 23 '18 at 9:38

@HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

– Andersson
Nov 23 '18 at 9:48

add a comment |

Try this one to avoid matching sub text:

record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text,

          driver.execute_script('return arguments[0].firstChild.textContent', tds[8]), 

          driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]

To get teamnames you can use

home = driver.find_element_by_xpath('//label[@for="team-home"]').text

away = driver.find_element_by_xpath('//label[@for="team-away"]').text

P.S. Consider to use Waits instead of time.sleep

edited Nov 23 '18 at 8:47

answered Nov 23 '18 at 8:38

Andersson

39.1k113669

Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

– HJA24
Nov 23 '18 at 9:38

@HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

– Andersson
Nov 23 '18 at 9:48

add a comment |

Try this one to avoid matching sub text:

record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text,

          driver.execute_script('return arguments[0].firstChild.textContent', tds[8]), 

          driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]

To get teamnames you can use

home = driver.find_element_by_xpath('//label[@for="team-home"]').text

away = driver.find_element_by_xpath('//label[@for="team-away"]').text

P.S. Consider to use Waits instead of time.sleep

edited Nov 23 '18 at 8:47

answered Nov 23 '18 at 8:38

Andersson

39.1k113669

Try this one to avoid matching sub text:

record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,

                  tds[5].text, tds[6].text, tds[7].text,

          driver.execute_script('return arguments[0].firstChild.textContent', tds[8]), 

          driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]

To get teamnames you can use

home = driver.find_element_by_xpath('//label[@for="team-home"]').text

away = driver.find_element_by_xpath('//label[@for="team-away"]').text

P.S. Consider to use Waits instead of time.sleep

edited Nov 23 '18 at 8:47

answered Nov 23 '18 at 8:38

Andersson

39.1k113669

edited Nov 23 '18 at 8:47

answered Nov 23 '18 at 8:38

Andersson

39.1k113669

answered Nov 23 '18 at 8:38

Andersson

39.1k113669

answered Nov 23 '18 at 8:38

Andersson

39.1k113669

Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

– HJA24
Nov 23 '18 at 9:38

@HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

– Andersson
Nov 23 '18 at 9:48

add a comment |

Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

– HJA24
Nov 23 '18 at 9:38

@HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

– Andersson
Nov 23 '18 at 9:48

Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

– HJA24
Nov 23 '18 at 9:38

@HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

– Andersson
Nov 23 '18 at 9:48

add a comment |

It looks like you just need to remove the sup's:

driver.execute_script("$('sup').remove()")

answered Nov 23 '18 at 9:22

pguardiario

36.8k980117

add a comment |

It looks like you just need to remove the sup's:

driver.execute_script("$('sup').remove()")

answered Nov 23 '18 at 9:22

pguardiario

36.8k980117

add a comment |

It looks like you just need to remove the sup's:

driver.execute_script("$('sup').remove()")

answered Nov 23 '18 at 9:22

pguardiario

36.8k980117

It looks like you just need to remove the sup's:

driver.execute_script("$('sup').remove()")

answered Nov 23 '18 at 9:22

pguardiario

36.8k980117

answered Nov 23 '18 at 9:22

pguardiario

36.8k980117

answered Nov 23 '18 at 9:22

pguardiario

36.8k980117

answered Nov 23 '18 at 9:22

pguardiario

36.8k980117

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Argthtjtr