1 Star 0 Fork 2

风中云未飞 / WebScraping_AmazonReviews

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
amazon_users.py 10.72 KB
一键复制 编辑 原始数据 按行查看 历史
lksfr 提交于 2018-10-16 00:10 . Add files via upload
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import time
import csv
# Scrapes the reviewers of one Amazon UK product (the iPhone X review listing
# below), visits each reviewer's profile, and writes one CSV row per reviewer:
#     user_name, fake_review_index
#
# Fake Review Index (highest possible score: 100)
#
# Understanding the scale:
#   below 10   very unlikely that the review is fake
#   10 - 20    unlikely that the review is fake
#   20 - 30    most likely not fake, but occasionally might be if the fake is
#              hard to identify
#   30 - 40    grey area, could be either
#   40 - 50    likely fake
#   above 50   very likely to be fake
#
# Indicators (implemented in compute_fake_index and its helpers):
#   helpful votes per review (max 10)
#       <= 1: +10 | <= 2: +8 | <= 4: +4 | <= 8: +2 | > 8: +0
#   total number of reviews (max 20)
#       1: +20 | 2-5: +15 | 6-15: +10 | 16-30: +7 | 31-50: +4 | > 50: +0
#   most reviews left on a single date (max 17.5)
#       >= 4: +17.5 | 3: +14 | 2: +10 | <= 1: +0
#   number of 5-star or 1-star ratings, whichever is larger (max 17.5)
#       >= 15: +17.5 | 10-14: +12 | 5-9: +7 | < 5: +3
#   most repeats of one review title (max 17.5): same tiers as dates
#   most repeats of one review text (max 17.5): same tiers as dates

REVIEWS_URL = 'https://www.amazon.co.uk/Apple-iPhone-64-SIM-Free-Smartphone-Space-Grey/product-reviews/B076GQZRR9/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=1'
OUTPUT_CSV = 'user_values_uk.csv'

AUTHOR_XPATH = '//a[@data-hook="review-author"]'
HELPFUL_TOTAL_XPATH = '//div[@class="dashboard-desktop-stats-section"]/div[1]/a[1]/div[1]/div[1]/span[1]'
NUM_REVIEWS_XPATH = '//div[@class="dashboard-desktop-stats-section"]/div[2]/a[1]/div[1]/div[1]/span[1]'
REVIEW_INFO_XPATH = '//div[@class="a-row a-spacing-mini profile-at-user-info"]'
REVIEW_DATE_XPATH = './div[1]/div[2]/span[2]'
REVIEW_CONTENT_XPATH = '//div[@class="a-section profile-at-content"]'
REVIEW_RATING_XPATH = './div[1]/div[1]/i[1]/span[1]'
REVIEW_TITLE_XPATH = './div[2]/h1[1]/span[1]/span[1]'
REVIEW_TEXT_XPATH = './div[2]/p[1]/span[1]/span[1]'
NEXT_PAGE_XPATH = '//li[@class="a-last"]'

# A reviewer with zero (or unreadable) review count is scored as if they had
# this many reviews, so the helpful-votes ratio never divides by zero.
MIN_REVIEW_COUNT = 0.001


def parse_count(text, default=0.0):
    """Convert a profile counter such as "37", "1,234" or "1.2k" to a float.

    Returns *default* when the text is missing, empty or not numeric, so a
    single odd-looking counter cannot abort the whole scrape.
    """
    if text is None:
        return default
    cleaned = text.strip().lower().replace(',', '')
    multiplier = 1.0
    if cleaned.endswith('k'):
        # "1.2k" / "10k" / "12.5k": thousands abbreviation of any length.
        multiplier = 1000.0
        cleaned = cleaned[:-1]
    try:
        return float(cleaned) * multiplier
    except ValueError:
        return default


def _max_repeats(values):
    """Return how often the most common item occurs in *values* (0 if empty)."""
    return max(map(values.count, values), default=0)


def _repeat_score(values):
    """Score repeated dates, titles or texts: more repeats is more suspicious."""
    repeats = _max_repeats(values)
    if repeats >= 4:
        return 17.5
    if repeats == 3:
        return 14
    if repeats == 2:
        return 10
    return 0


def _helpful_score(helpful_total, num_reviews):
    """Score the helpful-votes-per-review ratio (few helpful votes scores high)."""
    reviews = num_reviews if num_reviews > 0 else MIN_REVIEW_COUNT
    ratio = helpful_total / reviews
    # Contiguous tiers: the original thresholds left (2, 3] and (4, 5]
    # unscored, so those ratios are assigned to the next lower-scoring tier.
    if ratio <= 1:
        return 10
    if ratio <= 2:
        return 8
    if ratio <= 4:
        return 4
    if ratio <= 8:
        return 2
    return 0


def _review_count_score(num_reviews):
    """Score the total number of reviews (very few reviews scores high)."""
    if num_reviews == 1:
        return 20
    if 1 < num_reviews <= 5:
        return 15
    if 5 < num_reviews <= 15:
        return 10
    if 15 < num_reviews <= 30:
        return 7
    if 30 < num_reviews <= 50:
        return 4
    # More than 50 reviews, or an unknown/zero count, adds nothing.
    return 0


def _leading_star(rating_text):
    """Return the first digit of a rating text as an int, or None."""
    stripped = rating_text.strip() if rating_text else ''
    if stripped and stripped[0] in '0123456789':
        return int(stripped[0])
    return None


def _rating_score(ratings):
    """Score how many extreme (5-star or 1-star) ratings the user has left."""
    stars = [_leading_star(text) for text in ratings]
    # Use the larger of the two counts; "a or b" would ignore the 1-star
    # count whenever at least one 5-star rating exists.
    extreme = max(stars.count(5), stars.count(1))
    if extreme >= 15:
        return 17.5
    if extreme >= 10:
        return 12
    if extreme >= 5:
        return 7
    return 3


def compute_fake_index(helpful_total, num_reviews, dates, ratings, titles, texts):
    """Return the Fake Review Index (0-100) for one reviewer.

    helpful_total -- total "helpful" votes the reviewer has received
    num_reviews   -- number of reviews the reviewer has written
    dates         -- date text of each review shown on the profile
    ratings       -- rating text of each review; the first character is the
                     star count
    titles        -- title of each review
    texts         -- body text of each review
    """
    fake_index = 0
    fake_index += _helpful_score(helpful_total, num_reviews)
    fake_index += _review_count_score(num_reviews)
    fake_index += _repeat_score(dates)
    fake_index += _rating_score(ratings)
    fake_index += _repeat_score(titles)
    fake_index += _repeat_score(texts)
    return fake_index


def _read_stat(driver, xpath):
    """Return the text of a profile counter, or None if it cannot be read."""
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute("textContent")
    except Exception:
        # Deliberate best effort: a profile without this counter is scored
        # as zero rather than aborting the run.
        return None


def _scrape_profile(driver):
    """Collect the scoring inputs from the profile page currently open."""
    helpful_total = parse_count(_read_stat(driver, HELPFUL_TOTAL_XPATH))
    num_reviews = parse_count(_read_stat(driver, NUM_REVIEWS_XPATH))

    # Fixed wait before locating the review cards on the profile.
    time.sleep(5)
    dates = [
        info.find_element(By.XPATH, REVIEW_DATE_XPATH).get_attribute("textContent")
        for info in driver.find_elements(By.XPATH, REVIEW_INFO_XPATH)
    ]

    ratings = []
    titles = []
    texts = []
    for card in driver.find_elements(By.XPATH, REVIEW_CONTENT_XPATH):
        ratings.append(card.find_element(By.XPATH, REVIEW_RATING_XPATH).get_attribute("textContent"))
        titles.append(card.find_element(By.XPATH, REVIEW_TITLE_XPATH).get_attribute("textContent"))
        texts.append(card.find_element(By.XPATH, REVIEW_TEXT_XPATH).get_attribute("textContent"))

    return helpful_total, num_reviews, dates, ratings, titles, texts


def _scrape_page(driver, writer):
    """Visit every reviewer on the current listing page and write their score."""
    author_count = len(driver.find_elements(By.XPATH, AUTHOR_XPATH))
    for position in range(author_count):
        # The author links are looked up again on every pass because the
        # browser navigates away and back between reviewers.
        author = driver.find_elements(By.XPATH, AUTHOR_XPATH)[position]
        user_name = author.get_attribute("textContent")

        # Open the reviewer's profile and give it time to load.
        ActionChains(driver).move_to_element(author).click().perform()
        time.sleep(5)

        profile = _scrape_profile(driver)
        # Same total pause the original script spent here (5 s + 2 s).
        time.sleep(7)

        writer.writerow([user_name, compute_fake_index(*profile)])

        # Return to the "all reviews" listing.
        driver.execute_script("window.history.go(-1)")
        time.sleep(5)


def main():
    """Scrape every review page and write the results to OUTPUT_CSV."""
    driver = webdriver.Chrome()
    try:
        driver.get(REVIEWS_URL)
        # newline='' is what the csv module expects; without it Windows
        # output gets a blank line after every row.
        with open(OUTPUT_CSV, 'w', encoding='utf-8', newline='') as csv_file:
            writer = csv.writer(csv_file)
            page_number = 1
            while True:
                try:
                    # Progress output to verify that the loop is advancing.
                    print('scraping page number ' + str(page_number))
                    page_number += 1
                    time.sleep(3)
                    _scrape_page(driver, writer)
                    driver.find_element(By.XPATH, NEXT_PAGE_XPATH).click()
                    time.sleep(5)
                except Exception as e:
                    # The loop ends when no "Next Page" button is found on
                    # the last page; any other error also stops the run.
                    print(e)
                    break
    finally:
        # Always close the browser, even if the scrape fails part-way.
        driver.quit()


if __name__ == "__main__":
    main()
1
https://gitee.com/www_520dn_cn/WebScraping_AmazonReviews.git
git@gitee.com:www_520dn_cn/WebScraping_AmazonReviews.git
www_520dn_cn
WebScraping_AmazonReviews
WebScraping_AmazonReviews
master

搜索帮助