1. Web Scraping: Equip yourself with the skills and tools to efficiently retrieve product information from Amazon, including reviews, ratings, and comments.
2. Data Extraction and Processing: Clean and process the unstructured text of reviews and comments using natural language processing techniques.
3. Sentiment Analysis: Categorize reviews as positive, negative, or neutral using NLP libraries such as NLTK and its VADER analyzer (a minimal sketch follows this list).
4. Data Storage: Set up a database or cloud storage to keep your data organized and accessible.
5. Automation and Scheduling: Automate the process of regularly retrieving new reviews and comments to stay updated.
6. Power BI Integration: Export cleaned and analyzed data to Power BI for visualization. Create visually appealing charts, graphs, and dashboards to represent your findings.
7. Visualization Design: Design your Power BI reports for maximum impact, using visuals like word clouds and stacked bar charts.
8. User Interface: Build a user-friendly interface or dashboard to interact with your data. This could be a personal tool or a client-facing solution.
9. Error Handling: Implement error-handling mechanisms for any changes in the Amazon website structure or data extraction issues.
10. Compliance and Ethics: Ensure compliance with Amazon’s terms of service and ethical guidelines related to data scraping.
11. Scalability: Make sure your solution can handle a growing amount of data and products.
By following these steps, you can gain valuable insights into Amazon customer sentiment and product performance. Start your journey to data-driven decision-making today. Happy analyzing!
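For steps 2 and 3, here is a minimal sentiment-labeling sketch, assuming NLTK is installed and the reviews sit in a DataFrame with a 'body' column like the one built by the crawler below; the +/-0.05 cutoffs are VADER's conventional thresholds, and label_reviews is a hypothetical helper name.

import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
# one-time setup: import nltk; nltk.download('vader_lexicon')

def label_reviews(rev_data: pd.DataFrame) -> pd.DataFrame:
    """Score each review body with VADER and bucket it into a sentiment label."""
    sia = SentimentIntensityAnalyzer()
    # 'compound' is VADER's normalized score in [-1, 1]
    rev_data['compound'] = rev_data['body'].fillna('').map(
        lambda text: sia.polarity_scores(text)['compound'])
    rev_data['sentiment'] = pd.cut(rev_data['compound'],
                                   bins=[-1.001, -0.05, 0.05, 1.001],
                                   labels=['negative', 'neutral', 'positive'])
    return rev_data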
import os
import random as rand
import re
from datetime import datetime
from math import ceil
from time import sleep

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

os.chdir('/Users/jc_juls/Documents/Projects/Pulsar/Crawlers/Amazon/Reviews/')

chrome_driver_path = "/Users/jc_juls/Documents/Crawlers/Chromewebdriver/chromedriver"
browser = None
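# Shared WebDriver handle used by every helper below.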
def OpenBrowser():
    """Launch Chrome and set a US ZIP code on Amazon so prices render."""
    global browser
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
    options.add_argument('--disable-gpu')
    #options.add_argument('--headless')  # hide the browser UI
    options.add_argument('--no-sandbox')
    options.add_argument("window-size=1920,1080")
    browser = webdriver.Chrome(service=Service(chrome_driver_path), options=options)  # the driver binary is platform-specific
    browser.set_window_size(1440, 1080)
    browser.get('https://www.amazon.com/dp/B08SCNJBRR')
    # input a US ZIP code to get prices
    try:
        browser.find_element(By.ID, 'nav-global-location-popover-link').click()
    except:
        browser.find_element(By.XPATH, '//input[contains(@data-action-type,"SELECT_LOCATION")]').click()
    sleep(3)
    browser.find_element(By.XPATH, '//input[contains(@aria-label,"or enter")]').click()
    sleep(3)
    inpu = browser.find_element(By.XPATH, '//input[contains(@aria-label,"or enter")]')
    sleep(3)
    inpu.send_keys('33139')
    sleep(2)
    inpu.send_keys(Keys.ENTER)
    sleep(2)
    try:
        browser.find_elements(By.XPATH, '//input[contains(@id,"GLUXConfirmClose")]')[0].click()
    except:
        browser.find_elements(By.XPATH, '//input[contains(@id,"GLUXConfirmClose")]')[-1].click()
def RecordRating(val):
    """Map a histogram row like ['5 star', '75%'] to (column name, 0-1 fraction)."""
    for star in '12345':
        if star in val[0]:
            col = star + '_star_pct'
    value = int(re.sub(r"[^0-9.]", "", val[1])) / 100
    return col, value
def GetReviewRating(rev):
    """Return the 1-5 star rating of a review element, or None if no star icon is found."""
    for v in range(1, 6):
        try:
            rev.find_element(By.XPATH, ".//i[contains(@class,'a-star-{}')]".format(v))
            return v
        except:
            continue
    return None
def ProcessReviews(reviews):
    """Append unseen reviews on the current page to the global rev_data frame.
    Returns -1 when a review is already stored, True otherwise; reads `asin` from the crawl loop."""
    global rev_data, browser
for rev in reviews:
        if rev.text.endswith(('global ratings', 'with reviews', 'total ratings')):
#print('\t Continue\n\t'+rev.text)
continue
rid = rev.find_element(By.XPATH,".//div[contains(@id,'customer_review')]").get_attribute('id').replace('customer_review-','')
if rid not in rev_data.rev_id.unique():
genome = rev.find_element(By.XPATH,".//div[contains(@data-hook,'genome-widget')]").text
rev_rating = GetReviewRating(rev)
review_date = rev.find_element(By.XPATH,".//span[contains(@data-hook,'review-date')]").text
verified = 1 if 'Verified Purchase' in rev.text else 0
review_body = rev.find_element(By.XPATH,".//span[contains(@data-hook,'review-body')]").text
#print(rev_rating)
i_rev = rev_data.shape[0]
rev_data.at[i_rev,'rev_id'] = rid
rev_data.at[i_rev,'ASIN'] = asin
rev_data.at[i_rev,'updated_date'] = datetime.now().strftime('%Y-%m-%d')
rev_data.at[i_rev,'genome'] = genome
rev_data.at[i_rev,'rating'] = rev_rating
rev_data.at[i_rev,'date'] = review_date
rev_data.at[i_rev,'verified'] = verified
rev_data.at[i_rev,'body'] = review_body
else:
print('\tAlready in DB '+asin + ' id: '+ rid)
return -1
return True
def GetPrice():
    """Scrape the displayed price; returns '' when no price block is found."""
    global browser
    try:
        price = browser.find_element(By.ID, 'apex_offerDisplay_desktop')
        try:
            price = price.find_element(By.CLASS_NAME, 'a-price-whole').text + '.' + price.find_element(By.CLASS_NAME, 'a-price-fraction').text
        except:
            try:
                price = price.find_elements(By.XPATH, './/span[contains(@class,"a-offscreen")]')[0]
                price = re.sub(r"[^0-9.]", "", price.get_attribute('innerHTML'))
            except:
                price = ''
    except:
        price = ''
    return price
def GetBrand():
    """Return the brand name, falling back from the overview table to the byline; '' if unavailable."""
    global browser
    try:
        browser.find_element(By.XPATH, '//img[contains(@alt,"Sorry")]')  # dead/removed listing
        brand = ''
    except:
        sleep(rand.randint(18, 20) / 10)
        try:
            brand = browser.find_element(By.CLASS_NAME, 'po-brand')
            brand = brand.find_elements(By.CSS_SELECTOR, 'span')[-1].text
        except:
            try:
                brand = browser.find_element(By.ID, 'bylineInfo')
                brand = brand.text.replace('Brand: ', '').strip()
                brand = brand.replace('Visit the ', '')
                brand = brand.replace('Store', '').strip()
            except:
                brand = ''
    return brand
def GetDimensions():
    """Return (dimensions string, source label) from whichever detail table is present."""
    global browser
    try:
        dim = browser.find_element(By.ID, 'detailBullets_feature_div')
        for d in dim.find_elements(By.CSS_SELECTOR, 'li'):
            if 'Product Dimensions' in d.text:
                break
        dim = d.find_elements(By.CSS_SELECTOR, 'span')[-1].text
        dim_sour = 'product'
    except:
        try:
            dim = browser.find_element(By.ID, 'productDetails_techSpec_section_1')
            for d in dim.find_elements(By.CSS_SELECTOR, 'tr'):
                if 'Dimensions' in d.text:
                    break
            dim = d.find_elements(By.CSS_SELECTOR, 'td')[-1].text
            dim_sour = 'technical'
        except:
            try:
                dim = browser.find_element(By.ID, 'productDetails_detailBullets_sections1')
                for d in dim.find_elements(By.CSS_SELECTOR, 'tr'):
                    if 'Dimensions' in d.text:
                        break
                dim = d.find_elements(By.CSS_SELECTOR, 'td')[-1].text
                dim_sour = 'package'
            except:
                dim = ''
                dim_sour = ''
    return dim, dim_sour
# In[Open browser]:
OpenBrowser()
# In[get asins from store front]
def GetASINSfromStoreFront(store_url):
    """Open a brand storefront, switch to the 'All Products' grid, and collect unique ASINs."""
    browser.get(store_url)
    all_products_view = browser.find_element(By.XPATH, '//span[contains(text(),"All Products")]').find_element(By.XPATH, '..')
    all_products_view.click()
    WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.XPATH, '//li[contains(@class,"ProductGridItem")]')))
    products = browser.find_elements(By.XPATH, '//li[contains(@class,"ProductGridItem")]')
    asins_list = []
for p in products:
hrefs = p.find_elements(By.CSS_SELECTOR,"a")
for h in hrefs:
if 'dp' in h.get_attribute('href'):
asin = h.get_attribute('href')
asin = asin.split('dp/')[1].split('?')[0]
if asin not in asins_list:
asins_list.append(asin)
return asins_list
brand = 'Tru Lite bedding'
store_url = 'https://www.amazon.com/stores/page/944190B1-1127-4528-AEBB-4F3D21ED22C3?ingress=0&visitId=1b6e2d60-3194-4d48-b493-5dbee2337b30'
asins = GetASINSfromStoreFront(store_url)
asins_df = pd.DataFrame()
asins_df['asin'] = asins
asins_df['brand'] = brand
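os.makedirs(f'Raw data/Brands/{brand}', exist_ok=True)  # added safeguard: create the output folder if missing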
asins_df.to_csv(f'Raw data/Brands/{brand}/asins_list.csv',index=False)
# In[set partition]:
##################
##################
# UPDATE RUN NUMBER before each batch
run = 'run3'
category = 'Bath_salts'
file_name = 'bath_salts_asins.csv'
all_asins = pd.read_csv(f'Raw data/{file_name}')
os.makedirs(f'Raw data/{category}/asins_in_process', exist_ok=True)
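# Resume support: run*.csv files already in asins_in_process mark ASINs as claimed by earlier runs.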
files = os.listdir(f'Raw data/{category}/asins_in_process')
runs = pd.DataFrame()
for f in files:
    if f.startswith('run'):
        temp = pd.read_csv(f'Raw data/{category}/asins_in_process/' + f)
        runs = pd.concat([temp, runs])
if runs.shape[0] > 0:
    asins_to_process = all_asins[~all_asins.ASIN.isin(runs.ASIN)]
else:
    asins_to_process = all_asins
asins_to_process = asins_to_process['ASIN'].iloc[0:500]
asins_to_process = pd.DataFrame(asins_to_process)
asins_to_process.to_csv(f'Raw data/{category}/asins_in_process/'+run+'.csv',index=False)
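# Cap reviews pulled per product; Amazon paginates reviews 10 per page.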
NUM_REVIEWS_TO_GET_PER_ASIN = 5000
# asins_to_process = pd.read_csv('Raw data/Soap_data/asins_in_process/'+run+'.csv')
# In[]:
base = f'Raw data/{category}/asins_in_process/'  # relative to the working directory set at the top
base_asin_url = 'https://www.amazon.com/dp/{}'
asin_data = pd.DataFrame(columns=['ASIN','brand','price'])
rev_data = pd.DataFrame(columns=['rev_id','ASIN','updated_date','genome','rating','date','verified','body'])
#asin_data = pd.read_csv('asin_data.csv')
#rev_data = pd.read_csv('Raw data/Process2/rev_data_p2_1000-1500.csv')
ref = 0
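# ref is the position within the current batch; set it manually to resume mid-batch.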
# In[Continue]:
i_d = asin_data.shape[0]
for asin in asins_to_process[ref:].ASIN:
    print('Starting: {} | {:%}'.format(asin, ref / asins_to_process.shape[0]))
browser.get(base_asin_url.format(asin))
am_asin = browser.current_url.split('https://www.amazon.com/dp/')[-1].split('?')[0]
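    # ReviewMeta (unofficial third-party API) returns an adjusted rating for the ASIN; failures fall back to {}.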
v = requests.get('https://reviewmeta.com/api/amazon/{asin}'.format(asin=am_asin))
sleep(rand.randint(8,12)/10)
try:
re_meta = v.json()
except:
re_meta = {}
sleep(rand.randint(48, 60)/10)
i_d += 1
try:
total_rating_count = browser.find_element(By.XPATH,"//div[contains(@data-hook,'total-review-count')]")
        total_rating_count = int(re.sub(r"[^0-9.]", "", total_rating_count.text))
except:
ref+=1
continue
asin_data.at[i_d,'ASIN']= asin
asin_data.at[i_d,'total_rating_count']= total_rating_count
    rating = browser.find_elements(By.ID, 'histogramTable')[-1]
    rating = rating.find_elements(By.CSS_SELECTOR, 'tr')
for r in rating:
val = r.text
val = val.splitlines()
if len(val)==2:
col ,value = RecordRating(val)
asin_data.at[i_d,col]= value
#GET PRICE IF AVAILABLE
price = GetPrice()
brand = GetBrand()
dim , dim_sour = GetDimensions()
asin_data.at[i_d,'dimensions_source'] = dim_sour
asin_data.at[i_d,'dimensions'] = dim
    # click "See all reviews"
sleep(rand.randint(5, 15)/10)
try:
browser.find_element(By.XPATH,"//a[contains(@data-hook,'see-all-reviews-link-foot')]").click()
except:
try:
element = browser.find_element(By.XPATH,"//a[contains(text(),'See all reviews')]")
actions = ActionChains(browser)
actions.move_to_element(element).perform()
sleep(.8)
element.click()
except:
try:
browser.find_element(By.XPATH,"//a[contains(text(),'Next page')]")
except:
ref+=1
continue
sleep(rand.randint(10, 30)/10)
    try:
        number_of_reviews = browser.find_element(By.XPATH, "//div[contains(@data-hook,'cr-filter-info-review-rating-count')]")
        number_of_reviews = number_of_reviews.text.split('total ratings,')[-1]
        number_of_reviews = int(re.sub(r"[^0-9.]", "", number_of_reviews))
    except:
        try:
            browser.find_element(By.XPATH, "//a[contains(@data-hook,'see-all-reviews-link-foot')]").click()
            sleep(rand.randint(30, 50) / 10)
            number_of_reviews = browser.find_element(By.XPATH, "//div[contains(@data-hook,'cr-filter-info-review-rating-count')]")
            number_of_reviews = number_of_reviews.text.split('total ratings,')[-1]
            number_of_reviews = int(re.sub(r"[^0-9.]", "", number_of_reviews))
        except:
            ref += 1
            continue
    asin_data.at[i_d, 'number_of_reviews'] = number_of_reviews
asin_data.at[i_d,'updated_date'] = datetime.now().strftime('%Y-%m-%d')
asin_data.at[i_d,'price'] = price
asin_data.at[i_d,'brand'] = brand
asin_data.at[i_d,'rev_meta_rating'] = None if re_meta.get('rating') in ['','N/A'] else re_meta.get('rating')
asin_data.at[i_d,'rev_meta_overall'] = None if re_meta.get('s_overall') in ['','N/A'] else re_meta.get('s_overall')
asin_data.at[i_d,'rev_meta_count'] = None if re_meta.get('count') in ['','N/A'] else re_meta.get('count')
    number_of_reviews = min(NUM_REVIEWS_TO_GET_PER_ASIN, number_of_reviews)
    for n_rev_page in range(ceil(number_of_reviews / 10)):
reviews = browser.find_elements(By.XPATH,"//div[contains(@data-hook,'review')]")
        flag = False
        while flag is False:
            try:
                flag = ProcessReviews(reviews)
            except:
                flag = True  # give up on this page if parsing fails
#if flag == -1:
# break
        if n_rev_page == ceil(number_of_reviews / 10) - 1:
pass
else:
sleep(rand.randint(30, 50)/10)
try:
browser.find_element(By.XPATH,"//a[contains(text(),'Next page')]").click()
except:
continue
sleep(rand.randint(18, 40)/10)
sleep(rand.randint(5, 10)/10)
    print('{} num of reviews {} | {:%}'.format(asin, number_of_reviews, ref / asins_to_process.shape[0]))
ref+=1
    # checkpoint after each ASIN so a crash loses at most one product
    asin_data.to_parquet(base + 'asins_data_' + run + '.parquet.gzip', compression='gzip', index=False)
    rev_data.to_parquet(base + 'rev_data_' + run + '.parquet.gzip', compression='gzip', index=False)

# final save once the batch is done
asin_data.to_parquet(base + 'asins_data_' + run + '.parquet.gzip', compression='gzip', index=False)
rev_data.to_parquet(base + 'rev_data_' + run + '.parquet.gzip', compression='gzip', index=False)
print('Total asin reviews {}\nReviews collected: {} | {:%}'.format(asin_data.number_of_reviews.sum(),
                                                                   rev_data.shape[0],
                                                                   rev_data.shape[0] / asin_data.number_of_reviews.sum()))
#browser.close()
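For step 6, here is a minimal sketch of the hand-off to Power BI, assuming the parquet files written above and the hypothetical label_reviews helper from the sketch after the intro list; Power BI can then load the resulting CSV via Get Data > Text/CSV.

rev = pd.read_parquet(base + 'rev_data_' + run + '.parquet.gzip')
rev = label_reviews(rev)  # hypothetical helper defined in the earlier sentiment sketch
# one row per ASIN with negative/neutral/positive counts, ready for a stacked bar chart
summary = rev.groupby(['ASIN', 'sentiment'], observed=False).size().unstack(fill_value=0).reset_index()
summary.to_csv(base + 'sentiment_summary_' + run + '.csv', index=False)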