Python BeautifulSoup Parsing Script Tags
I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re
email = 'email'
password = 'password'
driver = webdriver.Chrome()
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
time.sleep(3)
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
s = requests.Session()
s.headers.update(headers)
for cookie in driver.get_cookies():
c = {cookie['name'] : cookie['value']}
s.cookies.update(c)
#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')
text = str(soup.find_all('script')[13])
value = re.findall(r'DisplayName', text)
print(value)
I am trying to access the certain data that comes after each "DisplayName" but I am failing to do so as I am just getting "DisplayName" instead of its value. If you need a better idea, you can print the "text" variable and search for "DisplayName". Thanks to all of those who reply in advance.
python regex selenium beautifulsoup python-requests
add a comment |
I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re
email = 'email'
password = 'password'
driver = webdriver.Chrome()
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
time.sleep(3)
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
s = requests.Session()
s.headers.update(headers)
for cookie in driver.get_cookies():
c = {cookie['name'] : cookie['value']}
s.cookies.update(c)
#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')
text = str(soup.find_all('script')[13])
value = re.findall(r'DisplayName', text)
print(value)
I am trying to access the certain data that comes after each "DisplayName" but I am failing to do so as I am just getting "DisplayName" instead of its value. If you need a better idea, you can print the "text" variable and search for "DisplayName". Thanks to all of those who reply in advance.
python regex selenium beautifulsoup python-requests
Copy the display name code and paste it here. You're getting the display name withre
but nothing behind it. Send the code and I'll help you fix that.
– Kamikaze_goldfish
Nov 21 '18 at 3:25
add a comment |
I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re
email = 'email'
password = 'password'
driver = webdriver.Chrome()
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
time.sleep(3)
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
s = requests.Session()
s.headers.update(headers)
for cookie in driver.get_cookies():
c = {cookie['name'] : cookie['value']}
s.cookies.update(c)
#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')
text = str(soup.find_all('script')[13])
value = re.findall(r'DisplayName', text)
print(value)
I am trying to access the certain data that comes after each "DisplayName" but I am failing to do so as I am just getting "DisplayName" instead of its value. If you need a better idea, you can print the "text" variable and search for "DisplayName". Thanks to all of those who reply in advance.
python regex selenium beautifulsoup python-requests
I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re
email = 'email'
password = 'password'
driver = webdriver.Chrome()
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
time.sleep(3)
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
s = requests.Session()
s.headers.update(headers)
for cookie in driver.get_cookies():
c = {cookie['name'] : cookie['value']}
s.cookies.update(c)
#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')
text = str(soup.find_all('script')[13])
value = re.findall(r'DisplayName', text)
print(value)
I am trying to access the certain data that comes after each "DisplayName" but I am failing to do so as I am just getting "DisplayName" instead of its value. If you need a better idea, you can print the "text" variable and search for "DisplayName". Thanks to all of those who reply in advance.
python regex selenium beautifulsoup python-requests
python regex selenium beautifulsoup python-requests
asked Nov 21 '18 at 1:08
otterdogotterdog
4917
4917
Copy the display name code and paste it here. You're getting the display name withre
but nothing behind it. Send the code and I'll help you fix that.
– Kamikaze_goldfish
Nov 21 '18 at 3:25
add a comment |
Copy the display name code and paste it here. You're getting the display name withre
but nothing behind it. Send the code and I'll help you fix that.
– Kamikaze_goldfish
Nov 21 '18 at 3:25
Copy the display name code and paste it here. You're getting the display name with
re
but nothing behind it. Send the code and I'll help you fix that.– Kamikaze_goldfish
Nov 21 '18 at 3:25
Copy the display name code and paste it here. You're getting the display name with
re
but nothing behind it. Send the code and I'll help you fix that.– Kamikaze_goldfish
Nov 21 '18 at 3:25
add a comment |
1 Answer
1
active
oldest
votes
So the reason you're not getting anything is because you're telling re
to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re
find the DisplayName but the .*
find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.
import re
url = "DisplayName='PoppaBear4'"
info = re.findall(r"DisplayName=.*'", url)
print(str(info).replace("DisplayName='",'').replace("'","").replace('["','').replace('"]',''))
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53403912%2fpython-beautifulsoup-parsing-script-tags%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
So the reason you're not getting anything is because you're telling re
to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re
find the DisplayName but the .*
find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.
import re
url = "DisplayName='PoppaBear4'"
info = re.findall(r"DisplayName=.*'", url)
print(str(info).replace("DisplayName='",'').replace("'","").replace('["','').replace('"]',''))
add a comment |
So the reason you're not getting anything is because you're telling re
to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re
find the DisplayName but the .*
find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.
import re
url = "DisplayName='PoppaBear4'"
info = re.findall(r"DisplayName=.*'", url)
print(str(info).replace("DisplayName='",'').replace("'","").replace('["','').replace('"]',''))
add a comment |
So the reason you're not getting anything is because you're telling re
to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re
find the DisplayName but the .*
find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.
import re
url = "DisplayName='PoppaBear4'"
info = re.findall(r"DisplayName=.*'", url)
print(str(info).replace("DisplayName='",'').replace("'","").replace('["','').replace('"]',''))
So the reason you're not getting anything is because you're telling re
to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re
find the DisplayName but the .*
find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.
import re
url = "DisplayName='PoppaBear4'"
info = re.findall(r"DisplayName=.*'", url)
print(str(info).replace("DisplayName='",'').replace("'","").replace('["','').replace('"]',''))
answered Nov 21 '18 at 3:56
Kamikaze_goldfishKamikaze_goldfish
453311
453311
add a comment |
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53403912%2fpython-beautifulsoup-parsing-script-tags%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Copy the display name code and paste it here. You're getting the display name with
re
but nothing behind it. Send the code and I'll help you fix that.– Kamikaze_goldfish
Nov 21 '18 at 3:25