import asyncio
import re
from playwright.async_api import async_playwright
import csv
import json
from datetime import datetime
import random
from urllib.parse import quote
from django.utils import timezone
from django.conf import settings

EXECUTABLE_PATH = settings.EXECUTABLE_PATH


class MadeInChinaScraper:
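    """Playwright-based scraper for made-in-china.com: walks paginated search
    results and enriches each product with data from its detail page.
    """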
    
    def __init__(self):
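        # Scraper configuration: Playwright navigation timeout (ms), retry
        # attempts per results page, and the min/max randomized delay (seconds)
        # between results pages.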
        self.products = []
        self.timeout = 45000
        self.retry_count = 3
        self.min_delay = 2
        self.max_delay = 5

    async def scrape_product_details(self, page, url):
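        """Open a single product page and collect categories, price ranges,
        rating, sample price, basic info, description, and image URLs.

        Returns a dict of details, or None if the page fails to load or an
        unexpected error occurs.
        """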
        try:
            print(f"[INFO] Loading product details page: {url}")
            try:
                await page.goto(url, timeout=self.timeout, wait_until="domcontentloaded")
                await page.wait_for_selector('.detail-tab-item, .basic-info-list', timeout=15000)
            except Exception as e:
                print(f"[WARN] Page load failed for {url}: {str(e)[:100]}...")
                return None
            
            details = {
                'basic_info': {},
                'description': "",
                'images': [],
                'categories': "",
                'price_ranges': {},
                'Rating': "",
                'Features': {},
                'Sample Info': "",
            }

            # Scrape basic info if available
            # Scrape category breadcrumb
            try:
                cat = []
                # Get the <li> element that holds the breadcrumb
                category_breadcrumb = await page.query_selector('.sr-QPWords-item')

                if category_breadcrumb:
                    # Get all <a> tags inside the breadcrumb
                    category_a_tags = await category_breadcrumb.query_selector_all('a')

                    for item in category_a_tags:
                        try:
                            category_text = (await item.text_content()).strip()
                            if category_text:
                                cat.append(category_text)
                        except Exception:
                            continue

                # Drop a leading "Home" crumb, then store the breadcrumb joined with " > "
                if cat:
                    if cat[0].lower().strip() == "home":
                        cat.pop(0)
                    details['categories'] = " > ".join(cat)

            except Exception as e:
                print(f"[WARN] Couldn't scrape category from Breadcrumb: {e}")

            
            try:
                details['price_ranges'] = {}

                # ---- Case 1: single price structure ----
                one_price_rows = await page.query_selector_all(".only-one-priceNum-tr")
                if one_price_rows:
                    print("[INFO] Found single price range")
                    for row in one_price_rows:
                        price_el = await row.query_selector('.only-one-priceNum-td-left')
                        moq_el = await row.query_selector('.sa-only-property-price')

                        price = (await price_el.inner_text()).strip() if price_el else None
                        moq = (await moq_el.inner_text()).strip() if moq_el else None

                        if price and moq:
                            # clean moq like "1 Set (MOQ)" → "1 Set"
                            moq = re.sub(r'\(.*?\)', '', moq).strip()
                            details['price_ranges'][moq] = price

                # ---- Case 2: multiple ranges ----
                else:
                    multiple_price_rows = await page.query_selector_all(".swiper-slide-div")
                    if multiple_price_rows:
                        for row in multiple_price_rows:
                            price_el = await row.query_selector('.swiper-money-container')
                            moq_el = await row.query_selector('.swiper-unit-container')

                            price = (await price_el.inner_text()).strip() if price_el else None
                            moq = (await moq_el.inner_text()).strip() if moq_el else None

                            if price and moq:
                                # clean "50+ Pieces" stays the same, but we strip whitespace
                                moq = re.sub(r'\(.*?\)', '', moq).strip()
                                details['price_ranges'][moq] = price

            except Exception as e:
                print(f"[WARN] Couldn't scrape price ranges: {e}")

            try:
                rating_el = await page.query_selector('.review-score')
                if rating_el:
                    rating_text = (await rating_el.inner_text()).strip()
                    details['Rating'] = rating_text
                    print(f"[INFO] Rating found: {rating_text}")
                else:
                    details['Rating'] = "N/A"
                    print("[INFO] No rating element found")
            except Exception as e:
                print(f"[WARN] Couldn't scrape rating: {e}")

                
            try:
                sample_order = await page.query_selector('.J-sample-order .sample-price')
                if sample_order:
                    price_text = (await sample_order.inner_text()).strip()
                    details["Sample Info"] = price_text
                else:
                    details["Sample Info"] = None
            except Exception as e:
                print(f"[WARN] Couldn't scrape sample product price: {e}")

            try:
                basic_info_items = await page.query_selector_all('.bsc-item')
                for item in basic_info_items:
                    try:
                        label_element = await item.query_selector('.bac-item-label')
                        value_element = await item.query_selector('.bac-item-value')

                        if label_element and value_element:
                            label = (await label_element.inner_text()).strip().replace(':', '')
                            value = (await value_element.inner_text()).strip()
                            details['basic_info'][label] = value
                        else:
                            # If one of them is missing, store None
                            if label_element:
                                label = (await label_element.inner_text()).strip().replace(':', '')
                                details['basic_info'][label] = None
                    except Exception as e:
                        print(f"[WARN] Failed parsing item: {e}")
                        continue
            except Exception as e:
                print(f"[WARN] Couldn't scrape basic info: {e}")

            # Scrape description if available
            try:
                desc_element = await page.query_selector('.detail-desc .rich-text')
                if desc_element:
                    details['description'] = (await desc_element.inner_text()).strip()
            except Exception as e:
                print(f"[WARN] Couldn't scrape description: {e}")

            # Scrape images if available
            try:
                details['images'] = []

                # Select all slide items
                slide_items = await page.query_selector_all('.sr-proMainInfo-slide-picItem')

                for slide in slide_items:
                    # 1) Check <img> inside slide
                    img_tag = await slide.query_selector('img.J-picImg-zoom-in')
                    if img_tag:
                        src = await img_tag.get_attribute('src')
                    else:
                        # 2) fallback: use fsrc attribute on the slide itself
                        src = await slide.get_attribute('fsrc')

                    if src:
                        # Clean protocol-relative URLs
                        if src.startswith("//"):
                            src = "https:" + src
                        elif not src.startswith(("http:", "https:")):
                            src = "https://" + src.lstrip("/")

                        # Skip .mp4.webp thumbnails (video previews)
                        if not src.lower().endswith(".mp4.webp"):
                            details['images'].append(src)

                print(f"[INFO] Collected {len(details['images'])} images")
            except Exception as e:
                print(f"[WARN] Couldn't scrape images: {e}")

            return details

        except Exception as e:
            print(f"[ERROR] Error scraping product details: {str(e)[:100]}...")
            return None
        
    async def scrape_page(self, page, url):
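        """Load a search results page with retries.

        Returns True when product results are present, False on a "no results"
        page or after exhausting all retry attempts.
        """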
        for attempt in range(self.retry_count):
            try:
                print(f"[INFO] Loading page (attempt {attempt + 1}): {url}")
                await page.goto(url, timeout=self.timeout, wait_until="domcontentloaded")
                
                # Wait for either products or "no results" message
                await page.wait_for_selector(
                    '.prod-content, .no-result-content', 
                    timeout=15000
                )
                
                # Check for no results
                no_results = await page.query_selector('.no-result-content')
                if no_results:
                    print("[INFO] No results found on this page")
                    return False
                
                return True
                
            except Exception as e:
                print(f"[WARN] Attempt {attempt + 1} failed: {str(e)[:100]}...")
                if attempt < self.retry_count - 1:
                    delay = random.uniform(3, 8)
                    print(f"[INFO] Retrying in {delay:.1f} seconds...")
                    await asyncio.sleep(delay)
                else:
                    print("[ERROR] Max retries reached for this page")
                    return False

    def build_filter_string(self, filters: dict) -> str:
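        """Translate a filters dict into the filter segment of the
        made-in-china.com multi-search URL.

        A minimal example of the mapping this method produces (whether the
        site honors each code is taken on trust from the original scraper):

            build_filter_string({"min_order": 100, "business_type": "Trading Company"})
            # -> "F1--Min_100--BT_2"

        With no recognized filters the base segment "F1" is returned.
        """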
        filter_parts = []

        # Map for business type
        business_type_map = {
            "Manufacturer/ factory": 1,
            "Trading Company": 2,
            "Service Provider": 3,
            "Group Corporation": 4,
            "Retailer": 5,
            "Trade Agent": 6,
            "Buying Office": 7,
            "Other": 8,
            "Wholesaler": 9,
            "Government Institution": 10,
            "Individuals/SOHO": 11
        }

        # Map for R&D type
        rd_type_map = {
            "OEM": 4,
            "ODM": 5,
            "Own Brand": 6,
            "Others": 99
        }

        if filters.get("secured_trading"):
            filter_parts.append("DP_1")

        if filters.get("buy_sample"):
            filter_parts.append("ODS_1")

        if filters.get("min_order"):
            filter_parts.append(f"Min_{filters['min_order']}")

        if filters.get("search_within"):
            term = filters["search_within"]
            filter_parts.append(f"PV_9999_{term}_999999999")

        if filters.get("audited_suppliers"):
            filter_parts.append("SGS_AS--CL_DGM")

        if filters.get("business_type"):
            bt_value = business_type_map.get(filters["business_type"])
            if bt_value:
                filter_parts.append(f"BT_{bt_value}")

        if filters.get("rd_type"):
            rd_value = rd_type_map.get(filters["rd_type"])
            if rd_value:
                filter_parts.append(f"RD_{rd_value}")

        # Join with --
        if filter_parts:
            return "F1--" + "--".join(filter_parts)
        return "F1"

    async def scrape_made_in_china(self, search_term, start_page=1, num_products=None, skip_in_page=0, filters=None):
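        """Run the full scrape: launch Chromium, walk the paginated search
        results for `search_term`, and collect product data.

        Args:
            search_term: query string for the site search.
            start_page: results page to start from (1-based).
            num_products: stop after this many products; None means continue
                until a page with no results is reached.
            skip_in_page: number of leading product cards to skip before
                collecting.
            filters: optional dict passed to build_filter_string().

        Returns the accumulated list of product dicts (self.products).
        """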
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized"
                ],
                slow_mo=100,
                executable_path=EXECUTABLE_PATH
            )
            
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/115.0.0.0 Safari/537.36",
                viewport={"width": 1366, "height": 768},
                locale="en-US"
            )
            
            page = await context.new_page()
            encoded_term = quote(search_term.replace(' ', '+'))

            filter_string = self.build_filter_string(filters or {})

            collected = 0
            current_page = start_page
            seen_count = 0

            while True:
                url = f"https://www.made-in-china.com/multi-search/{encoded_term}/{filter_string}/{current_page}.html"
                success = await self.scrape_page(page, url)
                if not success:
                    print(f"[INFO] Stopping, no results on page {current_page}")
                    break

                # product_nodes = await page.query_selector_all('.list-item, .list-node, .prod-item')
                product_nodes = await page.query_selector_all('.prod-content')
                print(f"[INFO] Found {len(product_nodes)} products on page {current_page}")

                for product in product_nodes:
                    if seen_count < skip_in_page:
                        seen_count += 1
                        continue

                    if num_products and collected >= num_products:
                        print(f"[INFO] Collected enough products ({collected}), stopping...")
                        await context.close()
                        await browser.close()
                        return self.products

                    try:
                        success = await self.scrape_single_product(page, product, current_page)
                        if success:   # only count entries that were actually appended as products
                            collected += 1
                    except Exception as e:
                        print(f"[ERROR] Failed to scrape product on page {current_page}: {str(e)[:100]}")

                # Advance to the next results page after a randomized delay
                current_page += 1
                delay = random.uniform(self.min_delay, self.max_delay)
                await asyncio.sleep(delay)

            await context.close()
            await browser.close()
            return self.products

    async def scrape_single_product(self, page, product, page_num):
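        """Extract one product card from the results page, then open its detail
        page for extra fields. Appends the combined record to self.products.

        Returns True when a real product was collected, False for skipped
        non-product entries (e.g. company-page links).
        """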

        product_data = {
            'name': "N/A",
            'link': "N/A",
            'price': "N/A",
            'moq': "N/A",
            'company': "N/A",
            'properties': {},
            'scraped_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'page_num': page_num,
            'type': "N/A",
            'sku': "N/A",
            'variant': "N/A",
            'Sample Info': "",
            'Rating': "",
        }
        # Name & Link
        name_element = await product.query_selector('h2.product-name a:not(.activity-flag-img), .prod-name a:not(.activity-flag-img)')
        if name_element:
            product_data['name'] = (await name_element.inner_text()).strip()
            product_data['link'] = await name_element.get_attribute('href') or "N/A"
            if product_data['link'].startswith("https"):
                product_data['link']=product_data['link']
            elif product_data['link'].startswith("//"):
                product_data['link'] = "https:" + product_data['link']
            elif not product_data['link'].startswith(("http:", "https:")):
                product_data['link'] = "https://www.made-in-china.com" + product_data['link']
    
        if (
            product_data['link'] == "N/A"
            or "made-in-china.com" not in product_data['link']
            or product_data['link'].endswith(".en.made-in-china.com")  # company page only
        ):
            print(f"[INFO] Skipping non-product entry on page {page_num}")
            return False  # <-- Do not append or count this product
        # Price
        try:
            price_element = await product.query_selector('.price-info .price, .price-val')
            if price_element:
                product_data['price'] = (await price_element.inner_text()).strip()
        except Exception:
            print("[WARN] Couldn't get price")

        # MOQ
        try:
            moq_element = await product.query_selector('.product-property .info:nth-of-type(2)')
            if moq_element:
                full_text = (await moq_element.inner_text()).strip()
                product_data['moq'] = re.sub(r'\(.*?\)', '', full_text).replace("MOQ", "").strip()
        except Exception:
            print("[WARN] Couldn't get MOQ")

        # Company
        try:
            company_element = await product.query_selector('.company-name-wrapper .company-name-txt a span')
            if not company_element:
                company_element = await product.query_selector('.company-name-wrapper span')

            product_data['company'] = (await company_element.inner_text()).strip() if company_element else "N/A"
        except Exception:
            print("[WARN] Couldn't get company")

        # Properties
        try:
            prop_elements = await product.query_selector_all('.property-list li')
            for prop in prop_elements:
                try:
                    # Label (text before the colon)
                    full_text = (await prop.inner_text()).strip()
                    label_text = full_text.split(":")[0].strip()

                    # Value (inside span.property-val)
                    value_element = await prop.query_selector('.property-val')
                    value_text = (await value_element.inner_text()).strip() if value_element else ""

                    product_data["properties"][label_text] = value_text
                except Exception:
                    continue
        except Exception:
            print("[WARN] Couldn't get properties")

        # Product details page
        if product_data['link'] != "N/A":
            context = page.context
            details_page = await context.new_page()
            product_details = await self.scrape_product_details(details_page, product_data['link'])
            match = re.search(r'/product/([^/]+)/', product_data['link'])
            if match:
                product_data['sku'] = match.group(1)  
           
            if product_details:
                product_data.update({
                    'basic_info': product_details.get('basic_info', {}),
                    'description': product_details.get('description', ''),
                    'images': product_details.get('images', []),
                    'categories': product_details.get('categories', ""),
                    'price ranges': product_details.get('price_ranges', {}),
                    'Rating': product_details.get('Rating', 'N/A'),
                    'Sample Info': product_details.get('Sample Info', ""),
                })
            await details_page.close()

        product_data['type'] = "inquiry"
        product_data['variant'] = "N/A"

        self.products.append(product_data)
        return True



async def save_to_csv(products, filename):
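    """Write scraped products to a CSV file, serializing nested dicts/lists as
    JSON strings. Note: the file I/O here is synchronous even though the
    function is declared async, so it briefly blocks the event loop.
    """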
    if not products:
        print("[WARN] No products to save")
        return
        
    # Include all possible fields in CSV
    fieldnames = ['name', 'link', 'price', 'moq', 'company', 'page_num', 'scraped_at']
    
    if any('properties' in p for p in products):
        fieldnames.append('properties')
    # Add detail fields if they exist in any product
    if any('basic_info' in p for p in products):
        fieldnames.append('basic_info')
    if any('description' in p for p in products):
        fieldnames.append('description')
    if any('images' in p for p in products):
        fieldnames.append('images')
    if any('categories' in p for p in products):
        fieldnames.append('categories')
    if any('price ranges' in p for p in products):
        fieldnames.append('price ranges')

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        for product in products:
            row = {k: product.get(k, '') for k in fieldnames}
            # Convert dicts/lists to strings for CSV
            for k, v in row.items():
                if isinstance(v, (dict, list)):
                    row[k] = json.dumps(v, ensure_ascii=False)
            writer.writerow(row)
    
    print(f"[INFO] Saved {len(products)} products to {filename}")

    
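# Example usage (a minimal sketch, assuming this module is imported inside a
# configured Django project so that settings.EXECUTABLE_PATH points at a
# Chromium binary; the search term and filter values below are illustrative):
#
#     async def run():
#         scraper = MadeInChinaScraper()
#         products = await scraper.scrape_made_in_china(
#             "solar panel",
#             start_page=1,
#             num_products=20,
#             filters={"business_type": "Trading Company", "min_order": 100},
#         )
#         await save_to_csv(products, "made_in_china_products.csv")
#
#     asyncio.run(run())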