Python BeautifulSoup Parsing Script Tags

I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

import requests

import time

from bs4 import BeautifulSoup

import json

import re



# Placeholder credentials as posted; the surrounding text says the script is
# meant to be run with a valid Xbox Live account.
email = 'email'

password = 'password'

# Start a Selenium-controlled Chrome session.
driver = webdriver.Chrome()

# Open the login.live.com sign-in page. The wreply parameter carries a
# URL-encoded account.xbox.com address.
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')

# Fixed pauses separate every step below — presumably to give each page time
# to load before the next element lookup.
time.sleep(3)

# Type the email into the element with id "i0116" (presumably the account field).
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)

time.sleep(5)

# Click the element with id "idSIButton9" (presumably the "Next" button).
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()

time.sleep(5)

# Type the password into the element with id "i0118".
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)

time.sleep(5)

# Click "idSIButton9" again (presumably the "Sign in" button on this step).
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()

time.sleep(5)

# Navigate the browser to the Friends page before the cookies are read below.
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')

print('Grabbing Cookies')

time.sleep(5)





# Fixed User-Agent header applied to every request made through the session.
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}



s = requests.Session()

s.headers.update(headers)



# Copy every cookie from the browser into the requests session, keeping only
# the name and value of each cookie.
for cookie in driver.get_cookies():

    c = {cookie['name'] : cookie['value']}

    s.cookies.update(c)



#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')





# Fetch the Profile page (friends tab) through the requests session and parse
# the response body with the built-in HTML parser.
soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')



# Take the 14th <script> tag (index 13) as a string. This raises IndexError if
# the page has fewer than 14 script tags.
text = str(soup.find_all('script')[13])



# NOTE: the pattern is only the literal text "DisplayName", with no capture
# group and nothing after it, so findall returns one 'DisplayName' string per
# occurrence rather than the values that follow each occurrence.
value = re.findall(r'DisplayName', text)



print(value)

I am trying to extract the value that follows each "DisplayName", but I only get the string "DisplayName" itself back instead of its value. To see the data I mean, print the "text" variable and search for "DisplayName". Thanks in advance to everyone who replies.

asked Nov 21 '18 at 1:08

otterdog

4917

Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

– Kamikaze_goldfish
Nov 21 '18 at 3:25

add a comment |

I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

import requests

import time

from bs4 import BeautifulSoup

import json

import re



# Placeholder credentials as posted; the surrounding text says the script is
# meant to be run with a valid Xbox Live account.
email = 'email'

password = 'password'

# Start a Selenium-controlled Chrome session.
driver = webdriver.Chrome()

# Open the login.live.com sign-in page. The wreply parameter carries a
# URL-encoded account.xbox.com address.
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')

# Fixed pauses separate every step below — presumably to give each page time
# to load before the next element lookup.
time.sleep(3)

# Type the email into the element with id "i0116" (presumably the account field).
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)

time.sleep(5)

# Click the element with id "idSIButton9" (presumably the "Next" button).
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()

time.sleep(5)

# Type the password into the element with id "i0118".
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)

time.sleep(5)

# Click "idSIButton9" again (presumably the "Sign in" button on this step).
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()

time.sleep(5)

# Navigate the browser to the Friends page before the cookies are read below.
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')

print('Grabbing Cookies')

time.sleep(5)





# Fixed User-Agent header applied to every request made through the session.
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}



s = requests.Session()

s.headers.update(headers)



# Copy every cookie from the browser into the requests session, keeping only
# the name and value of each cookie.
for cookie in driver.get_cookies():

    c = {cookie['name'] : cookie['value']}

    s.cookies.update(c)



#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')





# Fetch the Profile page (friends tab) through the requests session and parse
# the response body with the built-in HTML parser.
soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')



# Take the 14th <script> tag (index 13) as a string. This raises IndexError if
# the page has fewer than 14 script tags.
text = str(soup.find_all('script')[13])



# NOTE: the pattern is only the literal text "DisplayName", with no capture
# group and nothing after it, so findall returns one 'DisplayName' string per
# occurrence rather than the values that follow each occurrence.
value = re.findall(r'DisplayName', text)



print(value)

asked Nov 21 '18 at 1:08

otterdog

4917

Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

– Kamikaze_goldfish
Nov 21 '18 at 3:25

add a comment |

I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

import requests

import time

from bs4 import BeautifulSoup

import json

import re



# Placeholder credentials as posted; the surrounding text says the script is
# meant to be run with a valid Xbox Live account.
email = 'email'

password = 'password'

# Start a Selenium-controlled Chrome session.
driver = webdriver.Chrome()

# Open the login.live.com sign-in page. The wreply parameter carries a
# URL-encoded account.xbox.com address.
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')

# Fixed pauses separate every step below — presumably to give each page time
# to load before the next element lookup.
time.sleep(3)

# Type the email into the element with id "i0116" (presumably the account field).
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)

time.sleep(5)

# Click the element with id "idSIButton9" (presumably the "Next" button).
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()

time.sleep(5)

# Type the password into the element with id "i0118".
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)

time.sleep(5)

# Click "idSIButton9" again (presumably the "Sign in" button on this step).
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()

time.sleep(5)

# Navigate the browser to the Friends page before the cookies are read below.
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')

print('Grabbing Cookies')

time.sleep(5)





# Fixed User-Agent header applied to every request made through the session.
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}



s = requests.Session()

s.headers.update(headers)



# Copy every cookie from the browser into the requests session, keeping only
# the name and value of each cookie.
for cookie in driver.get_cookies():

    c = {cookie['name'] : cookie['value']}

    s.cookies.update(c)



#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')





# Fetch the Profile page (friends tab) through the requests session and parse
# the response body with the built-in HTML parser.
soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')



# Take the 14th <script> tag (index 13) as a string. This raises IndexError if
# the page has fewer than 14 script tags.
text = str(soup.find_all('script')[13])



# NOTE: the pattern is only the literal text "DisplayName", with no capture
# group and nothing after it, so findall returns one 'DisplayName' string per
# occurrence rather than the values that follow each occurrence.
value = re.findall(r'DisplayName', text)



print(value)

asked Nov 21 '18 at 1:08

otterdog

4917

I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

import requests

import time

from bs4 import BeautifulSoup

import json

import re



# Placeholder credentials as posted; the surrounding text says the script is
# meant to be run with a valid Xbox Live account.
email = 'email'

password = 'password'

# Start a Selenium-controlled Chrome session.
driver = webdriver.Chrome()

# Open the login.live.com sign-in page. The wreply parameter carries a
# URL-encoded account.xbox.com address.
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')

# Fixed pauses separate every step below — presumably to give each page time
# to load before the next element lookup.
time.sleep(3)

# Type the email into the element with id "i0116" (presumably the account field).
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)

time.sleep(5)

# Click the element with id "idSIButton9" (presumably the "Next" button).
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()

time.sleep(5)

# Type the password into the element with id "i0118".
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)

time.sleep(5)

# Click "idSIButton9" again (presumably the "Sign in" button on this step).
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()

time.sleep(5)

# Navigate the browser to the Friends page before the cookies are read below.
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')

print('Grabbing Cookies')

time.sleep(5)





# Fixed User-Agent header applied to every request made through the session.
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}



s = requests.Session()

s.headers.update(headers)



# Copy every cookie from the browser into the requests session, keeping only
# the name and value of each cookie.
for cookie in driver.get_cookies():

    c = {cookie['name'] : cookie['value']}

    s.cookies.update(c)



#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')





# Fetch the Profile page (friends tab) through the requests session and parse
# the response body with the built-in HTML parser.
soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')



# Take the 14th <script> tag (index 13) as a string. This raises IndexError if
# the page has fewer than 14 script tags.
text = str(soup.find_all('script')[13])



# NOTE: the pattern is only the literal text "DisplayName", with no capture
# group and nothing after it, so findall returns one 'DisplayName' string per
# occurrence rather than the values that follow each occurrence.
value = re.findall(r'DisplayName', text)



print(value)

python regex selenium beautifulsoup python-requests

asked Nov 21 '18 at 1:08

otterdog

4917

asked Nov 21 '18 at 1:08

otterdog

4917

asked Nov 21 '18 at 1:08

otterdog

4917

asked Nov 21 '18 at 1:08

otterdog

4917

asked Nov 21 '18 at 1:08

otterdog

4917

Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

– Kamikaze_goldfish
Nov 21 '18 at 3:25

add a comment |

Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

– Kamikaze_goldfish
Nov 21 '18 at 3:25

Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

– Kamikaze_goldfish
Nov 21 '18 at 3:25

add a comment |

1 Answer
1

active

oldest

votes

The reason you only get "DisplayName" back is that your pattern tells re to search for that exact phrase and nothing more: it never says to take any further characters, or where to stop. In the example below the value is wrapped in single quotes, but the code could be adjusted for double quotes. The pattern first finds DisplayName=, then .* picks up the characters that follow, ending at a closing single quote '. After that, everything around the value that you don't want is removed, leaving just the name.

import re



# Matches DisplayName='value' or DisplayName="value". Group 1 is the opening
# quote and group 2 is the value. The lazy ``.*?`` together with the ``\1``
# backreference stops at the first closing quote of the same kind, so several
# names on one line are captured separately instead of being swallowed into a
# single greedy match.
_DISPLAY_NAME_RE = re.compile(r"""DisplayName=(['"])(.*?)\1""")


def extract_display_names(text):
    """Return every DisplayName value found in *text*, in order of appearance.

    Args:
        text: The string to search, e.g. the contents of a <script> tag.

    Returns:
        A list of the quoted values that follow ``DisplayName=``, without the
        surrounding quotes. The list is empty when nothing matches.
    """
    # Reading the capture group directly avoids post-processing str(list) with
    # chained replace() calls, which would also strip apostrophes that are
    # part of a name.
    return [match.group(2) for match in _DISPLAY_NAME_RE.finditer(text)]


url = "DisplayName='PoppaBear4'"

# ``info`` now holds the bare names rather than the raw matched text.
info = extract_display_names(url)

# For the sample input this prints: PoppaBear4
print(", ".join(info))

answered Nov 21 '18 at 3:56

Kamikaze_goldfish

453311

add a comment |

Your Answer

// Page boilerplate: asks StackExchange.ifUsing to run the callback for the
// "editor" feature. The callback requests "externalEditor", then "snippets",
// and calls StackExchange.snippets.init() inside the innermost callback.
// NOTE(review): the third argument "code-snippets" is handed to ifUsing as-is;
// its meaning is not visible from this page.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

// Page boilerplate registered with StackExchange.ready: initialises the tag
// renderer with empty tag lists, then sets up the answer editor.
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Passes the answer-editor configuration to StackExchange.prepareEditor.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                // The backslashes of the \u003c / \u003e escapes ("<" and ">")
                // and of the escaped inner quotes had been stripped, which
                // ended the string literals early and made the block a syntax
                // error. They are restored here.
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer",
            immediatelyShowMarkdownHelp: true
        });
    }
});

draft saved

draft discarded

Sign up or log in

// Page boilerplate: once StackExchange reports ready, call
// helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked — confirm against the StackExchange helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

// Page boilerplate: once StackExchange reports ready, call openid.initPostLogin
// with the '.new-post-login' selector, a URL-encoded address pointing at this
// question's #new-answer anchor, and the string 'question_page'.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53403912%2fpython-beautifulsoup-parsing-script-tags%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

import re



# Matches DisplayName='value' or DisplayName="value". Group 1 is the opening
# quote and group 2 is the value. The lazy ``.*?`` together with the ``\1``
# backreference stops at the first closing quote of the same kind, so several
# names on one line are captured separately instead of being swallowed into a
# single greedy match.
_DISPLAY_NAME_RE = re.compile(r"""DisplayName=(['"])(.*?)\1""")


def extract_display_names(text):
    """Return every DisplayName value found in *text*, in order of appearance.

    Args:
        text: The string to search, e.g. the contents of a <script> tag.

    Returns:
        A list of the quoted values that follow ``DisplayName=``, without the
        surrounding quotes. The list is empty when nothing matches.
    """
    # Reading the capture group directly avoids post-processing str(list) with
    # chained replace() calls, which would also strip apostrophes that are
    # part of a name.
    return [match.group(2) for match in _DISPLAY_NAME_RE.finditer(text)]


url = "DisplayName='PoppaBear4'"

# ``info`` now holds the bare names rather than the raw matched text.
info = extract_display_names(url)

# For the sample input this prints: PoppaBear4
print(", ".join(info))

answered Nov 21 '18 at 3:56

Kamikaze_goldfish

453311

add a comment |

import re



# Matches DisplayName='value' or DisplayName="value". Group 1 is the opening
# quote and group 2 is the value. The lazy ``.*?`` together with the ``\1``
# backreference stops at the first closing quote of the same kind, so several
# names on one line are captured separately instead of being swallowed into a
# single greedy match.
_DISPLAY_NAME_RE = re.compile(r"""DisplayName=(['"])(.*?)\1""")


def extract_display_names(text):
    """Return every DisplayName value found in *text*, in order of appearance.

    Args:
        text: The string to search, e.g. the contents of a <script> tag.

    Returns:
        A list of the quoted values that follow ``DisplayName=``, without the
        surrounding quotes. The list is empty when nothing matches.
    """
    # Reading the capture group directly avoids post-processing str(list) with
    # chained replace() calls, which would also strip apostrophes that are
    # part of a name.
    return [match.group(2) for match in _DISPLAY_NAME_RE.finditer(text)]


url = "DisplayName='PoppaBear4'"

# ``info`` now holds the bare names rather than the raw matched text.
info = extract_display_names(url)

# For the sample input this prints: PoppaBear4
print(", ".join(info))

answered Nov 21 '18 at 3:56

Kamikaze_goldfish

453311

add a comment |

import re



# Matches DisplayName='value' or DisplayName="value". Group 1 is the opening
# quote and group 2 is the value. The lazy ``.*?`` together with the ``\1``
# backreference stops at the first closing quote of the same kind, so several
# names on one line are captured separately instead of being swallowed into a
# single greedy match.
_DISPLAY_NAME_RE = re.compile(r"""DisplayName=(['"])(.*?)\1""")


def extract_display_names(text):
    """Return every DisplayName value found in *text*, in order of appearance.

    Args:
        text: The string to search, e.g. the contents of a <script> tag.

    Returns:
        A list of the quoted values that follow ``DisplayName=``, without the
        surrounding quotes. The list is empty when nothing matches.
    """
    # Reading the capture group directly avoids post-processing str(list) with
    # chained replace() calls, which would also strip apostrophes that are
    # part of a name.
    return [match.group(2) for match in _DISPLAY_NAME_RE.finditer(text)]


url = "DisplayName='PoppaBear4'"

# ``info`` now holds the bare names rather than the raw matched text.
info = extract_display_names(url)

# For the sample input this prints: PoppaBear4
print(", ".join(info))

answered Nov 21 '18 at 3:56

Kamikaze_goldfish

453311

import re



# Matches DisplayName='value' or DisplayName="value". Group 1 is the opening
# quote and group 2 is the value. The lazy ``.*?`` together with the ``\1``
# backreference stops at the first closing quote of the same kind, so several
# names on one line are captured separately instead of being swallowed into a
# single greedy match.
_DISPLAY_NAME_RE = re.compile(r"""DisplayName=(['"])(.*?)\1""")


def extract_display_names(text):
    """Return every DisplayName value found in *text*, in order of appearance.

    Args:
        text: The string to search, e.g. the contents of a <script> tag.

    Returns:
        A list of the quoted values that follow ``DisplayName=``, without the
        surrounding quotes. The list is empty when nothing matches.
    """
    # Reading the capture group directly avoids post-processing str(list) with
    # chained replace() calls, which would also strip apostrophes that are
    # part of a name.
    return [match.group(2) for match in _DISPLAY_NAME_RE.finditer(text)]


url = "DisplayName='PoppaBear4'"

# ``info`` now holds the bare names rather than the raw matched text.
info = extract_display_names(url)

# For the sample input this prints: PoppaBear4
print(", ".join(info))

answered Nov 21 '18 at 3:56

Kamikaze_goldfish

453311

answered Nov 21 '18 at 3:56

Kamikaze_goldfish

453311

answered Nov 21 '18 at 3:56

Kamikaze_goldfish

453311

answered Nov 21 '18 at 3:56

Kamikaze_goldfish

453311

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

// Page boilerplate: once StackExchange reports ready, call
// helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked — confirm against the StackExchange helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

// Page boilerplate: once StackExchange reports ready, call
// helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked — confirm against the StackExchange helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

// Page boilerplate: once StackExchange reports ready, call
// helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked — confirm against the StackExchange helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

// Page boilerplate: once StackExchange reports ready, call
// helpers.onClickDraftSave with the '#login-link' selector.
// NOTE(review): presumably this saves the answer draft when the login link is
// clicked — confirm against the StackExchange helpers.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is for reference only. If you need detailed information, please check here

搜尋此網誌

Argthtjtr