Scrape value from table of webpage












0















I want to retrieve the match data from the following website:



https://understat.com/match/81



I wrote the following script:



import sys
import time
import os
import io
import csv

from selenium import webdriver
import selenium.webdriver.support.expected_conditions as ec
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

# Shared Selenium WebDriver handle. It starts unset and is read as a global by
# scrap_understat() and get_match_data(); presumably init_browser() assigns
# it -- TODO confirm, that helper is not shown here.
driver = None
# Not referenced by any code visible here -- presumably a database connection
# handle; TODO confirm.
cnx = None
# sys.path[0] is the directory of the script that launched the interpreter;
# save_data() builds the CSV output path from it.
currentDir = sys.path[0]

def scrap_understat(first_match_id=80, stop_match_id=10080):
    """Scrape the player tables of a range of understat.com match pages.

    Every id in ``range(first_match_id, stop_match_id)`` is opened in the
    browser. A page on which ``try_find_Element`` reports an element with
    class ``error-code`` is skipped; any other page is parsed with
    ``get_match_data()`` and the rows are handed to ``save_data()``.

    Args:
        first_match_id: First match id to request (inclusive).
        stop_match_id: Match id at which to stop (exclusive).

    An exception raised while handling one match is logged and printed, and
    the loop then continues with the next id.
    """
    init_browser('firefox')

    try:
        for match_id in range(first_match_id, stop_match_id):
            try:
                driver.get('https://understat.com/match/' + str(match_id))
                time.sleep(1)  # fixed pause after navigation before querying the page
                if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
                    continue

                save_data(get_match_data())
            except Exception as ex:
                log_this(ex)
                print(str(ex))
    finally:
        # Also runs on KeyboardInterrupt, so the browser is not left open.
        close_browser()


def get_match_data():
    """Collect one record per roster row from the currently loaded match page.

    For every element named ``team`` the following sibling is clicked and,
    after a short pause, the rows of the ``match-rosters`` table body are read.

    Returns:
        A list of 9-item lists holding the ``.text`` of cells 1 to 9 of each
        row, in the order the team elements and rows appear on the page.

    Raises:
        IndexError: If a row has fewer than ten ``td`` cells.
    """
    data = []

    teams = driver.find_elements(By.NAME, 'team')

    for team in teams:
        # Presumably this click switches the roster table to that team --
        # TODO confirm against the page markup.
        team.find_element(By.XPATH, 'following-sibling::*').click()
        time.sleep(1)
        players = (driver.find_element(By.ID, 'match-rosters')
                   .find_element(By.TAG_NAME, 'tbody')
                   .find_elements(By.TAG_NAME, 'tr'))
        for player in players:
            tds = player.find_elements(By.TAG_NAME, 'td')
            # Explicit indices (not a slice) so a short row still raises.
            # NOTE: .text is the cell's whole visible text, nested elements
            # included.
            data.append([tds[i].text for i in range(1, 10)])

    return data


def save_data(data):
    """Append records to the CSV output file, writing a header if it is new.

    The path is built from the module-level ``currentDir`` and
    ``current_time``. ``current_time`` is not defined in the code shown here
    and must be a string set elsewhere in the module.

    Args:
        data: Iterable of rows (sequences of cell values) to append.
    """
    csv_file_path = currentDir + '/output' + current_time + '.csv'
    # Check before opening: mode 'a' creates a missing file, which would hide
    # the fact that the header row is still needed.
    needs_header = not os.path.exists(csv_file_path)
    # The with-block closes the file even when a write raises.
    with io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1') as csv_file:
        writer = csv.writer(csv_file)
        if needs_header:
            writer.writerow(
                ['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG', 'xA'])
        writer.writerows(data)


The output of my script looks like this:



enter image description here



So, there is a problem with the xG and xA columns. I only want the main value (the part that is not in superscript), whereas the script takes all the text within the td. How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is inside a sup element.



Second question: How do I get the team names (Manchester United / Tottenham Hotspur) stored in variables?
enter image description here










share|improve this question



























    0















    I want to retrieve the match data from the following website:



    https://understat.com/match/81



    I wrote the following script:



    import sys
    import time
    import os
    import io
    import csv

    from selenium import webdriver
    import selenium.webdriver.support.expected_conditions as ec
    import selenium.webdriver.support.ui as ui
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By

    driver = None
    cnx = None
    currentDir = sys.path[0]

    def scrap_understat():
    init_browser('firefox')

    for i in range(80, 10080):
    try:
    driver.get('https://understat.com/match/' + str(i))
    time.sleep(1)
    if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
    continue

    data = get_match_data()
    save_data(data)

    except Exception as ex:
    log_this(ex)
    print(str(ex))

    close_browser()


    def get_match_data():
    data =

    teams = driver.find_elements(By.NAME, 'team')

    for team in teams:
    team.find_element(By.XPATH, 'following-sibling::*').click()
    time.sleep(1)
    players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
    for player in players:
    tds = player.find_elements(By.TAG_NAME, 'td')
    record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
    tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]
    data.append(record)

    return data


    def save_data(data):
    # save CSV
    csv_file_path = currentDir + '/output' + current_time + '.csv'
    file = None
    writer = None
    if not os.path.exists(csv_file_path):
    file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')
    writer = csv.writer(file)
    writer.writerow(
    ['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',
    'xA'])
    else:
    file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')
    writer = csv.writer(file)
    for record in data:
    writer.writerow(record)
    file.close()


    The output of my script looks like this:



    enter image description here



    So, there is a problem with the xG- and xA-columns. I only want the lowerscript-part whereas the script takes all the text within the td How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is called sub-class



    Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
    enter image description here










    share|improve this question

























      0












      0








      0








      I want to retrieve the match data from the following website:



      https://understat.com/match/81



      I wrote the following script:



      import sys
      import time
      import os
      import io
      import csv

      from selenium import webdriver
      import selenium.webdriver.support.expected_conditions as ec
      import selenium.webdriver.support.ui as ui
      from selenium.common.exceptions import TimeoutException
      from selenium.webdriver.common.by import By

      driver = None
      cnx = None
      currentDir = sys.path[0]

      def scrap_understat():
      init_browser('firefox')

      for i in range(80, 10080):
      try:
      driver.get('https://understat.com/match/' + str(i))
      time.sleep(1)
      if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
      continue

      data = get_match_data()
      save_data(data)

      except Exception as ex:
      log_this(ex)
      print(str(ex))

      close_browser()


      def get_match_data():
      data =

      teams = driver.find_elements(By.NAME, 'team')

      for team in teams:
      team.find_element(By.XPATH, 'following-sibling::*').click()
      time.sleep(1)
      players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
      for player in players:
      tds = player.find_elements(By.TAG_NAME, 'td')
      record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
      tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]
      data.append(record)

      return data


      def save_data(data):
      # save CSV
      csv_file_path = currentDir + '/output' + current_time + '.csv'
      file = None
      writer = None
      if not os.path.exists(csv_file_path):
      file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')
      writer = csv.writer(file)
      writer.writerow(
      ['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',
      'xA'])
      else:
      file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')
      writer = csv.writer(file)
      for record in data:
      writer.writerow(record)
      file.close()


      The output of my script looks like this:



      enter image description here



      So, there is a problem with the xG- and xA-columns. I only want the lowerscript-part whereas the script takes all the text within the td How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is called sub-class



      Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
      enter image description here










      share|improve this question














      I want to retrieve the match data from the following website:



      https://understat.com/match/81



      I wrote the following script:



      import sys
      import time
      import os
      import io
      import csv

      from selenium import webdriver
      import selenium.webdriver.support.expected_conditions as ec
      import selenium.webdriver.support.ui as ui
      from selenium.common.exceptions import TimeoutException
      from selenium.webdriver.common.by import By

      driver = None
      cnx = None
      currentDir = sys.path[0]

      def scrap_understat():
      init_browser('firefox')

      for i in range(80, 10080):
      try:
      driver.get('https://understat.com/match/' + str(i))
      time.sleep(1)
      if try_find_Element(driver, By.CLASS_NAME, 'error-code') is not None:
      continue

      data = get_match_data()
      save_data(data)

      except Exception as ex:
      log_this(ex)
      print(str(ex))

      close_browser()


      def get_match_data():
      data =

      teams = driver.find_elements(By.NAME, 'team')

      for team in teams:
      team.find_element(By.XPATH, 'following-sibling::*').click()
      time.sleep(1)
      players = driver.find_element(By.ID, 'match-rosters').find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')
      for player in players:
      tds = player.find_elements(By.TAG_NAME, 'td')
      record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
      tds[5].text, tds[6].text, tds[7].text, tds[8].text, tds[9].text]
      data.append(record)

      return data


      def save_data(data):
      # save CSV
      csv_file_path = currentDir + '/output' + current_time + '.csv'
      file = None
      writer = None
      if not os.path.exists(csv_file_path):
      file = io.open(csv_file_path, 'w', newline='', encoding='ISO-8859-1')
      writer = csv.writer(file)
      writer.writerow(
      ['player', 'pos', 'min', 'sh', 'g', 'kp', 'a', 'xG',
      'xA'])
      else:
      file = io.open(csv_file_path, 'a', newline='', encoding='ISO-8859-1')
      writer = csv.writer(file)
      for record in data:
      writer.writerow(record)
      file.close()


      The output of my script looks like this:



      enter image description here



      So, there is a problem with the xG- and xA-columns. I only want the lowerscript-part whereas the script takes all the text within the td How do I change my script to only include the first part? By inspecting the page elements, I see that the undesired part is called sub-class



      Second question: How do I get the teamname declared as variable (Manchester United / Tottenham Hotspurs)
      enter image description here







      python selenium web-scraping






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked Nov 23 '18 at 8:27









      HJA24HJA24

      13714




      13714
























          2 Answers
          2






          active

          oldest

          votes


















          0














          Try this one to avoid matching sub text:



          record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
          tds[5].text, tds[6].text, tds[7].text,
          driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
          driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]


          To get teamnames you can use



          # find_element_by_xpath was removed in Selenium 4; use the By locator form.
          home = driver.find_element(By.XPATH, '//label[@for="team-home"]').text
          away = driver.find_element(By.XPATH, '//label[@for="team-away"]').text


          P.S. Consider using Waits instead of time.sleep






          share|improve this answer


























          • Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

            – HJA24
            Nov 23 '18 at 9:38











          • @HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

            – Andersson
            Nov 23 '18 at 9:48





















          0














          It looks like you just need to remove the sup's:



          driver.execute_script("$('sup').remove()")





          share|improve this answer
























            Your Answer






            StackExchange.ifUsing("editor", function () {
            StackExchange.using("externalEditor", function () {
            StackExchange.using("snippets", function () {
            StackExchange.snippets.init();
            });
            });
            }, "code-snippets");

            StackExchange.ready(function() {
            var channelOptions = {
            tags: "".split(" "),
            id: "1"
            };
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function() {
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
            createEditor();
            });
            }
            else {
            createEditor();
            }
            });

            function createEditor() {
            StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
            brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
            contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
            allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            });


            }
            });














            draft saved

            draft discarded


















            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53443028%2fscrape-value-from-table-of-webpage%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown

























            2 Answers
            2






            active

            oldest

            votes








            2 Answers
            2






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes









            0














            Try this one to avoid matching sub text:



            record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
            tds[5].text, tds[6].text, tds[7].text,
            driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
            driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]


            To get teamnames you can use



            home = driver.find_element_by_xpath('//label[@for="team-home"]').text
            away = driver.find_element_by_xpath('//label[@for="team-away"]').text


            P.S. Consider to use Waits instead of time.sleep






            share|improve this answer


























            • Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

              – HJA24
              Nov 23 '18 at 9:38











            • @HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

              – Andersson
              Nov 23 '18 at 9:48


















            0














            Try this one to avoid matching sub text:



            record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
            tds[5].text, tds[6].text, tds[7].text,
            driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
            driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]


            To get teamnames you can use



            home = driver.find_element_by_xpath('//label[@for="team-home"]').text
            away = driver.find_element_by_xpath('//label[@for="team-away"]').text


            P.S. Consider to use Waits instead of time.sleep






            share|improve this answer


























            • Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

              – HJA24
              Nov 23 '18 at 9:38











            • @HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

              – Andersson
              Nov 23 '18 at 9:48
















            0












            0








            0







            Try this one to avoid matching sub text:



            record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
            tds[5].text, tds[6].text, tds[7].text,
            driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
            driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]


            To get teamnames you can use



            home = driver.find_element_by_xpath('//label[@for="team-home"]').text
            away = driver.find_element_by_xpath('//label[@for="team-away"]').text


            P.S. Consider to use Waits instead of time.sleep






            share|improve this answer















            Try this one to avoid matching sub text:



            record = [tds[1].text, tds[2].text, tds[3].text, tds[4].text,
            tds[5].text, tds[6].text, tds[7].text,
            driver.execute_script('return arguments[0].firstChild.textContent', tds[8]),
            driver.execute_script('return arguments[0].firstChild.textContent', tds[9])]


            To get teamnames you can use



            home = driver.find_element_by_xpath('//label[@for="team-home"]').text
            away = driver.find_element_by_xpath('//label[@for="team-away"]').text


            P.S. Consider to use Waits instead of time.sleep







            share|improve this answer














            share|improve this answer



            share|improve this answer








            edited Nov 23 '18 at 8:47

























            answered Nov 23 '18 at 8:38









            AnderssonAndersson

            39.1k113669




            39.1k113669













            • Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

              – HJA24
              Nov 23 '18 at 9:38











            • @HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

              – Andersson
              Nov 23 '18 at 9:48





















            • Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

              – HJA24
              Nov 23 '18 at 9:38











            • @HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

              – Andersson
              Nov 23 '18 at 9:48



















            Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

            – HJA24
            Nov 23 '18 at 9:38





            Thank you! Your solution works perfectly. One last little question, if I want to specify the corresponding team within the for player in players-loop, how can I do this

            – HJA24
            Nov 23 '18 at 9:38













            @HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

            – Andersson
            Nov 23 '18 at 9:48







            @HJA24 , try for team in teams: team_name = team.find_element_by_xpath('./following-sibling::label').text. You can then use team_name in players loop - this value will be the same for all team players

            – Andersson
            Nov 23 '18 at 9:48















            0














            It looks like you just need to remove the sup's:



            driver.execute_script("$('sup').remove()")





            share|improve this answer




























              0














              It looks like you just need to remove the sup's:



              driver.execute_script("$('sup').remove()")





              share|improve this answer


























                0












                0








                0







                It looks like you just need to remove the sup's:



                driver.execute_script("$('sup').remove()")





                share|improve this answer













                It looks like you just need to remove the sup's:



                driver.execute_script("$('sup').remove()")






                share|improve this answer












                share|improve this answer



                share|improve this answer










                answered Nov 23 '18 at 9:22









                pguardiariopguardiario

                36.8k980117




                36.8k980117






























                    draft saved

                    draft discarded




















































                    Thanks for contributing an answer to Stack Overflow!


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid



                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.


                    To learn more, see our tips on writing great answers.




                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function () {
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53443028%2fscrape-value-from-table-of-webpage%23new-answer', 'question_page');
                    }
                    );

                    Post as a guest















                    Required, but never shown





















































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown

































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown







                    Popular posts from this blog

                    "Incorrect syntax near the keyword 'ON'. (on update cascade, on delete cascade,)

                    Alcedinidae

                    RAC Tourist Trophy