Vestiaire Collective is a luxury fashion resale platform from France. It's a popular web scraping target as it's one of the biggest second-hand markets for luxury fashion items.
In this tutorial, we'll take a quick look at how to scrape Vestiaire Collective using Python. In this guide we'll cover:
- Scrape Vestiaire Collective product listing data.
- Find product listings using Vestiaire Collective sitemaps.
This is a very easy scraper as we'll be using hidden web data scraping to effortlessly collect product and seller data.
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
Why Scrape Vestiaire Collective?
Vestiaire Collective is a major exchange for luxury fashion items. Scraping this website can be useful for a number of reasons:
- Luxury fashion market analysis
- Competitive analysis
- Market prediction
For more on web scraping uses see our web scraping use case hub.
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
Scrape Preview
We'll be scraping the whole product dataset available on Vestiaire Collective which includes:
- Product details like name, descriptions and features.
- Product media (photos, videos).
- Product pricing.
- Seller details.
Here's an example dataset we'll be collecting with our Python scraper:
Example Product Dataset
{
"id": "32147447",
"type": "product",
"name": "Sweatshirt",
"price": {
"currency": "CAD",
"cents": 23033,
"formatted": "CDN$230.33"
},
"isLocal": true,
"description": "Worn once anine bing tiger sweatshirt sz M in excellent condition",
"likeCount": 3,
"path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml",
"sold": false,
"reserved": false,
"negotiable": true,
"inStock": false,
"measurementFormatted": "Size: 8 US",
"receipt": false,
"available": true,
"consignment": false,
"prohibited": false,
"localizedDescription": "Worn once anine bing tiger sweatshirt sz M in excellent condition",
"originalDescription": "Worn once anine bing tiger sweatshirt sz M in excellent condition",
"originalDescriptionLanguage": "en",
"metadata": {
"title": "Sweatshirt Anine Bing Beige size 8 US in Cotton - 32147447",
"description": "Buy your sweatshirt Anine Bing on Vestiaire Collective, the luxury consignment store online. Second-hand Sweatshirt Anine Bing Beige in Cotton available. 32147447",
"keywords": "Anine Bing Cotton Knitwear"
},
"warehouse": {
"name": "Brooklyn",
"localizedName": "Brooklyn"
},
"pictures": [
{
"alt": "Sweatshirt Anine Bing",
"path": "32147447-1_2.jpg"
},
{
"alt": "Buy Anine Bing Sweatshirt online",
"path": "32147447-2_2.jpg"
},
{
"alt": "Luxury Anine Bing Knitwear Women ",
"path": "32147447-3_2.jpg"
},
{
"alt": "Second hand Clothing Women ",
"path": "32147447-4_2.jpg"
},
{
"alt": "Sweatshirt Anine Bing",
"path": "32147447-5_2.jpg"
}
],
"size": {
"id": "7",
"type": "size",
"size": "8",
"standard": "US",
"localizedStandard": "US"
},
"brand": {
"id": "5344",
"type": "brand",
"name": "Anine Bing",
"localizedName": "anine bing",
"url": {
"original": "http://vestiairecollective.com/anine-bing/",
"path": "/anine-bing/",
"url": "http://vestiairecollective.com/anine-bing/"
}
},
"material": {
"id": "2",
"type": "material",
"name": "Cotton",
"localizedName": "Cotton"
},
"color": {
"id": "2",
"type": "color",
"name": "Beige",
"localizedName": "Beige"
},
"condition": {
"id": "",
"type": "condition",
"description": "Very good condition"
},
"universe": {
"id": "1",
"type": "universe",
"name": "Women",
"localizedName": "Women"
},
"category": {
"id": "56",
"type": "category",
"name": "Knitwear",
"localizedName": "Knitwear",
"parent": {
"id": "2",
"type": "category",
"name": "Clothing",
"localizedName": "Clothing"
}
},
"subcategory": {
"id": "17",
"type": "subcategory",
"name": "Sweatshirts",
"localizedName": "Sweatshirts"
},
"season": {
"id": "3",
"type": "season",
"name": "All seasons",
"localizedName": "All seasons"
},
"model": {
"id": "0",
"type": "model",
"name": "",
"localizedName": ""
},
"seller": {
"id": "9797796",
"type": "user",
"firstname": "kate",
"username": "kate9797796",
"hyperwalletActive": false,
"alreadyDepositedAProduct": false,
"mood": "",
"country": "United States",
"countryISO": "US",
"civility": {
"name": "miss",
"localizedName": "miss",
"idGender": 3
},
"language": {
"name": "en",
"localizedName": "en",
"code": "en"
},
"hasWallet": false,
"badges": [
"recommended",
"direct-shipping",
"expert-seller"
],
"statistics": {
"productsWished": 0,
"productsSold": 126,
"productsListed": 585,
"productsBought": 0,
"passRate": 90,
"usuallyShipsWithin": "1-2 days"
},
"sellerRating": {
"badge": "Expert",
"goals": {
"conformity": 1,
"cx": 0,
"shipping": 0.93,
"volume": 32,
"tags": {
"volume": true,
"shipping": true,
"conformity": true
}
},
"goalsThresholds": [
{
"category": "volume",
"max_value": 5,
"thresholds": [
{
"label": "Trusted",
"value": 2
},
{
"label": "Expert",
"value": 5
}
]
},
{
"category": "conformity",
"max_value": 1,
"thresholds": [
{
"label": "Trusted",
"value": 0.8
},
{
"label": "Expert",
"value": 0.9
}
]
},
{
"category": "shipping",
"max_value": 1,
"thresholds": [
{
"label": "Trusted",
"value": 0.8
},
{
"label": "Expert",
"value": 0.9
}
]
}
],
"achievementsGoals": [
{
"category": "volume",
"achievements": [
{
"badge": "Trusted"
},
{
"badge": "Expert"
}
],
"tip": "Achieved"
},
{
"category": "conformity",
"achievements": [
{
"badge": "Trusted"
},
{
"badge": "Expert"
}
],
"tip": "Achieved"
},
{
"category": "shipping",
"achievements": [
{
"badge": "Trusted"
},
{
"badge": "Expert"
}
],
"tip": "Achieved"
}
]
},
"picture": {
"path": "/profil/missing_avatar.gif"
},
"social": {
"nbFollowers": 225,
"nbFollows": 7,
"productsLiked": 331,
"communityRank": 6914,
"followed": false
},
"vacation": {
"active": false
},
"segment": "C2C"
},
"creationDate": "2023-03-30T20:34:48Z",
"meshLinks": {
"topCategory": {
"name": "Women Clothing",
"localizedName": "Women Clothing",
"url": {
"url": "http://vestiairecollective.com//women-clothing/",
"path": "/women-clothing/"
}
},
"category": {
"name": "Knitwear",
"localizedName": "Knitwear",
"url": {
"url": "http://vestiairecollective.com//women-clothing/knitwear/",
"path": "/women-clothing/knitwear/"
}
},
"categoryBrand": {
"name": "Anine Bing Knitwear",
"localizedName": "Anine Bing Knitwear",
"url": {
"url": "http://vestiairecollective.com//women-clothing/knitwear/anine-bing/",
"path": "/women-clothing/knitwear/anine-bing/"
}
},
"categoryBrandModelMaterial": {
"name": "Anine Bing Cotton Knitwear",
"localizedName": "Anine Bing Cotton Knitwear",
"url": {
"url": "http://vestiairecollective.com//women-clothing/knitwear/anine-bing/cotton/",
"path": "/women-clothing/knitwear/anine-bing/cotton/"
}
}
},
"alternateVersions": [
{
"language": "de",
"path": "/damen-kleidung/pullover/anine-bing/beige-baumwolle-anine-bing-pullover-32147447.shtml"
},
{
"language": "x-default",
"path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml"
},
{
"language": "us",
"path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml"
},
{
"language": "en",
"path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml"
},
{
"language": "es",
"path": "/mujer-ropa/jerseis-chalecos/anine-bing/jerseis-chalecos-anine-bing-de-algodon-beige-32147447.shtml"
},
{
"language": "fr",
"path": "/vetements-femme/pulls-gilets/anine-bing/pullgilet-anine-bing-en-coton-beige-32147447.shtml"
},
{
"language": "it",
"path": "/donna-abbigliamento/maglioni-gilet/anine-bing/maglioni-gilet-anine-bing-beige-cotone-32147447.shtml"
}
],
"shouldBeGone": false,
"indexation": {
"index": true,
"follow": true,
"crawlPagination": false
},
"buyerFees": [
{
"rateType": "FLAT",
"value": 2500,
"description": "",
"cost": {
"currency": "CAD",
"cents": 2500,
"formatted": "CDN$25"
}
}
],
"dutyAndTax": {
"currency": "CAD",
"cents": 0,
"formatted": "CDN$0"
},
"flags": [
"direct-shipping"
]
}
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
Setup
To scrape this target we'll need a few Python packages commonly used in web scraping. Since we'll be using the hidden web data scraping approach all we need is two packages:
- httpx - powerful HTTP client which we'll be using to retrieve the HTML pages.
- parsel - HTML parser which we'll be using to extract hidden JSON datasets.
These packages can be installed using Python's pip
console command:
$ pip install httpx parsel
For Scrapfly users there's also a Scrapfly SDK version of each code example. The SDK can be installed using pip
as well:
$ pip install "scrapfly-sdk[all]"
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
Scrape Product Data
Let's start by taking a look at a single product page and how can we scrape it using Python. For example, let's take this product page:
/beige-cotton-anine-bing-knitwear-32147447.shtml
We could parse the page HTML using CSS selectors or XPath, but since Vestiaire Collective is using the Next.js JavaScript framework we can extract the dataset directly from the page source:
We can find this by inspecting the page source and looking for a unique product identifier like name or id (ctrl+f). In the example above we can see it's under the <script id="__NEXT_DATA__">
HTML element.
This is called hidden web data scraping and it's a really simple and effective way to scrape data from websites that use javascript frameworks like next.js. To scrape it all we have to do:
- Retrieve the product HTML page.
- Find the hidden JSON dataset using CSS selectors and parsel.
- Load JSON as Python dictionary using json.loads.
- Select the product fields.
In practical Python this would look something like this:
Python
ScrapFly
import asyncio
import json
import httpx
from parsel import Selector
# create HTTP client with default headers that look like a web browser and enable HTTP2 version
client = httpx.AsyncClient(
    follow_redirects=True,
    http2=True,
    headers={
        # Chrome user agent strings start with the "Mozilla/5.0" product token
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        # quality values (q=) must fall within the 0-1 range
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    },
)
def find_hidden_data(html) -> dict:
    """Extract the hidden web data (__NEXT_DATA__ JSON) from page HTML.

    Args:
        html: HTML source of the page.

    Returns:
        The text of the ``<script id="__NEXT_DATA__">`` element parsed from
        JSON into a Python dictionary.
    """
    # The ID and the ::text pseudo-element must be attached to "script"
    # without spaces, otherwise the selector does not target the script tag.
    data = Selector(html).css("script#__NEXT_DATA__::text").get()
    return json.loads(data)
async def scrape_product(url: str):
    """Scrape the product dataset from a single product page."""
    # download the product page
    page = await client.get(url)
    # the page dataset is embedded in the HTML as hidden web data
    hidden = find_hidden_data(page.text)
    # keep only the product part of the page dataset
    return hidden["props"]["pageProps"]["product"]
# example scrape run:
example_url = "https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml"
example_product = asyncio.run(scrape_product(example_url))
print(example_product)
import asyncio
import json
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
# Scrapfly client shared by all scrape functions of this example
scrapfly = ScrapflyClient(
    key="YOUR SCRAPFLY KEY",
    max_concurrency=10,
)
def find_hidden_data(result: ScrapeApiResponse) -> dict:
    """Extract the hidden __NEXT_DATA__ JSON dataset from page HTML.

    Args:
        result: Scrapfly scrape result of the page.

    Returns:
        The text of the ``<script id="__NEXT_DATA__">`` element parsed from
        JSON into a Python dictionary.
    """
    # The ID and the ::text pseudo-element must be attached to "script"
    # without spaces, otherwise the selector does not target the script tag.
    data = result.selector.css("script#__NEXT_DATA__::text").get()
    return json.loads(data)
async def scrape_product(url: str) -> dict:
    """Scrape a single Vestiaire Collective product page for product data.

    Args:
        url: URL of the product page.

    Returns:
        The ``product`` dictionary found in the page's hidden web data.
    """
    result = await scrapfly.async_scrape(
        ScrapeConfig(
            url=url,
            cache=True,  # use cache while developing to speed up scraping for repeated script runs
            asp=True,  # Anti-Scraping Protection bypass allows to scrape protected pages
        )
    )
    data = find_hidden_data(result)
    product = data["props"]["pageProps"]["product"]
    return product
# example run of 1 product scrape
print(asyncio.run(scrape_product("https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml")))
In just a few lines of Python code, we extracted the whole product dataset which includes all of the product details and seller information!
Next up, let's take a look at how to find product listings using Vestiaire Collective sitemaps.
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
Finding Products
Vestiaire Collective has an extensive sitemap suite that can be used to find all of the product listings. So, to find product pages we'll be scraping sitemaps.
The Vestiaire Collective sitemap index is available at:
/sitemaps/https_sitemap-en.xml
It contains sitemaps split into various categories, like by brand, new listings and item type (clothing, shoes):
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<!-- sitemap url and category clues, this one is for brands -->
<loc>https://www.vestiairecollective.com/sitemaps/https_en-brands-1.xml</loc>
<!-- when the sitemap was updated -->
<lastmod>2023-04-07</lastmod>
</sitemap>
<sitemap>
<loc>https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml</loc>
<lastmod>2023-04-07</lastmod>
</sitemap>
...
</sitemapindex>
Each of these sitemaps contains 50 000 product listings.
For our example, let's scrape the newest listings which can be found on the new_items.xml
sitemaps.
The new_items-1.xml
sitemap contains the newest 50_000 items. Let's see how to scrape it:
Python
ScrapFly
import asyncio
import json
from typing import Dict, List
import httpx
from parsel import Selector
# create HTTP client with default headers that look like a web browser and enable HTTP2 version
client = httpx.AsyncClient(
    follow_redirects=True,
    http2=True,
    headers={
        # Chrome user agent strings start with the "Mozilla/5.0" product token
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        # quality values (q=) must fall within the 0-1 range
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    },
)
def find_hidden_data(html) -> dict:
    """Extract the hidden web data (__NEXT_DATA__ JSON) from page HTML.

    Args:
        html: HTML source of the page.

    Returns:
        The text of the ``<script id="__NEXT_DATA__">`` element parsed from
        JSON into a Python dictionary.
    """
    # The ID and the ::text pseudo-element must be attached to "script"
    # without spaces, otherwise the selector does not target the script tag.
    data = Selector(html).css("script#__NEXT_DATA__::text").get()
    return json.loads(data)
async def scrape_product(url: str):
    """Scrape the product dataset from a product page.

    Returns None when the request was redirected with a 308 status code,
    which is how products that are no longer available are detected here.
    """
    # download the product page
    page = await client.get(url)
    # look for a 308 redirect in the redirect chain of the response
    gone = next((hop for hop in page.history if hop.status_code == 308), None)
    if gone is not None:
        print(f"product {gone.url} is no longer available")
        return None
    # the page dataset is embedded in the HTML as hidden web data
    hidden = find_hidden_data(page.text)
    # keep only the product part of the page dataset
    return hidden["props"]["pageProps"]["product"]
async def scrape_sitemap(url: str, max_pages: int = 100) -> List[Dict]:
    """Scrape the products listed in a Vestiaire Collective sitemap.

    Only the first `max_pages` product URLs of the sitemap are scraped.
    """
    # download the sitemap and collect the product URLs it lists
    print(f"scraping sitemap page: {url}")
    sitemap = await client.get(url)
    product_urls = Selector(sitemap.text).css("url>loc::text").getall()
    print(f"found {len(product_urls)} products in the sitemap: {url}\n scraping the first {max_pages} products")
    # schedule one scrape task per product so they run concurrently
    tasks = []
    for product_url in product_urls[:max_pages]:
        tasks.append(asyncio.create_task(scrape_product(product_url)))
    return await asyncio.gather(*tasks)
# example scrape run:
sitemap_url = "https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml"
sitemap_products = asyncio.run(scrape_sitemap(sitemap_url, max_pages=5))
print(sitemap_products)
import asyncio
from typing import Dict, List

from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
# Scrapfly client shared by all scrape functions of this example
scrapfly = ScrapflyClient(
    key="YOUR SCRAPFLY KEY",
    max_concurrency=10,
)
async def scrape_sitemap(url: str, max_pages: int = 100) -> List[Dict]:
    """Scrape the products listed in a Vestiaire Collective sitemap.

    Only the first `max_pages` product URLs of the sitemap are scraped and
    products whose page was redirected with a 308 status code are skipped.
    """
    print(f"scraping sitemap page: {url}")
    sitemap_result = await scrapfly.async_scrape(ScrapeConfig(url=url, asp=True))
    product_urls = sitemap_result.selector.css("url>loc::text").getall()
    print(f"found {len(product_urls)} products in the sitemap: {url}\n scraping the first {max_pages} products")
    scrape_configs = [ScrapeConfig(url=product_url, asp=True) for product_url in product_urls[:max_pages]]
    products = []
    async for result in scrapfly.concurrent_scrape(scrape_configs):
        # Vestiaire Collective redirects to product category if product is no longer available (sold, deleted etc.)
        was_redirected = any(hop["http_code"] == 308 for hop in result.context["redirects"])
        if was_redirected:
            print(f"Product page {result.scrape_config.url} is no longer available")
            continue
        hidden = find_hidden_data(result)
        products.append(hidden["props"]["pageProps"]["product"])
    return products
# example scrape: scrape the first 10 newest listings
asyncio.run(scrape_sitemap("https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml", max_pages=10))
Above, we've used simple XML parsing using parsel
to extract URLs from the new listings sitemap. Then we scrape hidden web data of each product like we've done in the previous chapter.
Sitemaps are great for finding scrape targets quickly and efficiently. Though to further scale our scraper up let's take a look at how to avoid blocking using Scrapfly SDK.
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
Avoiding Blocking with ScrapFly
Vestiaire Collective can be difficult to scrape at scale as it blocks web scraping using the Cloudflare anti-scraping service. So, to scale up our scrapers, we'll need to use proxies or other tools to avoid scraper blocking, such as the Scrapfly API.
Scrapfly service does the heavy lifting for you!
Scrapfly API is a perfect tool for scaling up web scrapers and avoiding being blocked. It's a drop-in replacement for the tools we used in this guide and comes with scraper power-up features like:
- Millions of Residential Proxies
- Anti Scraping Protection bypass
- Javascript rendering and headless cloud browsers
- Web dashboard for monitoring and managing scrapers
All these tools can be easily accessed through Python SDK:
from scrapfly import ScrapeConfig, ScrapflyClient
client = ScrapflyClient(key="YOUR SCRAPFLY KEY")
result = client.scrape(ScrapeConfig(
    url="https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml",
    # enable scraper blocking service bypass
    asp=True,
    # optional - render javascript using headless browsers:
    render_js=True,
))
print(result.content)
For more on web scraping Vestiaire Collective with ScrapFly check out the Full Scraper Code section.
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
FAQ
To wrap up our guide on how to scrape Vestiaire Collective, let's take a look at some frequently asked questions.
Is it legal to scrape Vestiaire Collective?
Yes. All of the data we scraped in this tutorial is publicly available, which is perfectly legal to scrape. However, attention should be paid when using scraped seller data as it can be protected by GDPR or copyright in Europe.
Can Vestiaire Collective be crawled?
Yes. Crawling is a form of web scraping where the scraper discovers product listings on its own, and Vestiaire Collective offers many discovery points such as recommendations, search and sitemaps.
Summary
In this quick tutorial, we took a look at how to scrape Vestiaire Collective using Python. We covered how to use the hidden web data scraping approach to quickly extract product datasets from HTML pages. To find the products we've covered how to use sitemaps to quickly collect all of the product listings by category.
To avoid blocking we've taken a look at Scrapfly API scaling solution which can be used to scale your scraping projects to collect public datasets like this one in a matter of minutes!
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
Get Your FREE API KeyDiscover ScrapFly
<!--kg-card-end: markdown--><!--kg-card-begin: markdown-->
Full Scraper Code
Here's the full Vestiaire Collective product scraper using Python and Scrapfly Python SDK:
💙 This code should only be used as a reference. To scrape data from Vestiaire Collective at scale you'll need to adjust it to your preferences and environment
import asyncio
import os
import json
from pathlib import Path
from typing import Dict, List
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
# Scrapfly client shared by all scrape functions; the API key is read from
# the SCRAPFLY_KEY environment variable
scrapfly = ScrapflyClient(
    key=os.environ["SCRAPFLY_KEY"],
    max_concurrency=10,
)
def find_hidden_data(result: ScrapeApiResponse) -> dict:
    """Extract the hidden __NEXT_DATA__ JSON dataset from page HTML.

    Args:
        result: Scrapfly scrape result of the page.

    Returns:
        The text of the ``<script id="__NEXT_DATA__">`` element parsed from
        JSON into a Python dictionary.
    """
    # The ID and the ::text pseudo-element must be attached to "script"
    # without spaces, otherwise the selector does not target the script tag.
    data = result.selector.css("script#__NEXT_DATA__::text").get()
    return json.loads(data)
async def scrape_product(url: str) -> dict:
    """Scrape a single Vestiaire Collective product page for product data.

    Args:
        url: URL of the product page.

    Returns:
        The ``product`` dictionary found in the page's hidden web data.
    """
    result = await scrapfly.async_scrape(
        ScrapeConfig(
            url=url,
            cache=True,
            asp=True,
        )
    )
    data = find_hidden_data(result)
    product = data["props"]["pageProps"]["product"]
    return product
async def scrape_sitemap(url: str, max_pages: int = 100) -> List[Dict]:
    """Scrape the products listed in a Vestiaire Collective sitemap.

    Only the first `max_pages` product URLs of the sitemap are scraped and
    products whose page was redirected with a 308 status code are skipped.
    """
    print(f"scraping sitemap page: {url}")
    sitemap_result = await scrapfly.async_scrape(ScrapeConfig(url=url, asp=True))
    product_urls = sitemap_result.selector.css("url>loc::text").getall()
    print(f"found {len(product_urls)} products in the sitemap: {url}\n scraping the first {max_pages} products")
    scrape_configs = [ScrapeConfig(url=product_url, asp=True) for product_url in product_urls[:max_pages]]
    products = []
    async for result in scrapfly.concurrent_scrape(scrape_configs):
        # Vestiaire Collective redirects to product category if product is no longer available (sold, deleted etc.)
        was_redirected = any(hop["http_code"] == 308 for hop in result.context["redirects"])
        if was_redirected:
            print(f"Product page {result.scrape_config.url} is no longer available")
            continue
        hidden = find_hidden_data(result)
        products.append(hidden["props"]["pageProps"]["product"])
    return products
async def example_run():
    """
    Scrape an example product and a sitemap for the 5 newest items and
    save them to ./results/product.json and ./results/sitemap.json respectively.
    """
    out_dir = Path(__file__).parent / "results"
    out_dir.mkdir(exist_ok=True)

    # ensure_ascii=False keeps non-ASCII characters unescaped, so the files are
    # written explicitly as UTF-8 instead of the platform's default encoding
    product = await scrape_product("https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml")
    out_dir.joinpath("product.json").write_text(
        json.dumps(product, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    products = await scrape_sitemap("https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml", max_pages=5)
    out_dir.joinpath("sitemap.json").write_text(
        json.dumps(products, indent=2, ensure_ascii=False), encoding="utf-8"
    )
# run the example only when this file is executed as a script
if __name__ == "__main__":
    asyncio.run(example_run())
<!--kg-card-end: markdown--><!--kg-card-begin: html-->{<br> "@context": "<a href="https://schema.org">https://schema.org</a>",<br> "@type": "FAQPage",<br> "mainEntity": [<br> {<br> "@type": "Question",<br> "name": "Is it legal to scrape Vestiaire Collective?",<br> "acceptedAnswer": {<br> "@type": "Answer",<br> "text": "<p>Yes. All of the data we scraped in this tutorial is available publically which is perfectly legal to scrape. However, attention should be paid when using scraped seller data as it can be protected by GDPR or copyright in Europe.</p>"<br> }<br> },<br> {<br> "@type": "Question",<br> "name": "Can Vestiaire Collective be crawled?",<br> "acceptedAnswer": {<br> "@type": "Answer",<br> "text": "<p>Yes. Crawling is a form of web scraping where the scraper discovers product listing on it's own and Visetiaire Collective offers many discovery points such as recommendations, search and sitemaps.</p>"<br> }<br> }<br> ]<br> }<!--kg-card-end: html-->
Top comments (0)