Advanced Sampling Strategies
This guide covers sophisticated sampling techniques for complex research designs.
Stratified Sampling by Road Type
When you need representative samples from different road categories:
# Stratified sampling: draw a fixed-size sample split across road-type strata.
from geo_sampling import sample_roads
# Sample with specific proportions
# NOTE(review): assumes geo_sampling.sample_roads is a module exposing a
# sample_roads() function that accepts these keyword arguments — confirm
# against the geo_sampling API documentation.
sample_roads.sample_roads(
'delhi_roads.csv',
'delhi_stratified.csv',
sample_size=1000,
stratify_by='road_type',
# Per-stratum shares of sample_size; these sum to 1.0.
strata_proportions={
'motorway': 0.1,
'primary': 0.2,
'secondary': 0.3,
'residential': 0.4
}
)
Geographic Clustering
For studies requiring spatial clusters:
import pandas as pd
from sklearn.cluster import KMeans
from geo_sampling import sample_roads

# Load road data; expects 'latitude' and 'longitude' columns.
roads = pd.read_csv('roads.csv')

# Create geographic clusters with K-Means on raw lat/lon coordinates.
# NOTE(review): degrees are not isotropic distances — acceptable at city
# scale, but confirm for larger study areas.
n_clusters = 10
coords = roads[['latitude', 'longitude']].values
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
roads['cluster'] = kmeans.fit_predict(coords)

# Sample from each cluster. Cap the draw at the cluster size so that
# DataFrame.sample cannot raise ValueError when a cluster holds fewer
# than 50 roads.
samples = []
for cluster_id in range(n_clusters):
    cluster_roads = roads[roads['cluster'] == cluster_id]
    cluster_sample = cluster_roads.sample(n=min(50, len(cluster_roads)),
                                          random_state=42)
    samples.append(cluster_sample)

final_sample = pd.concat(samples)
final_sample.to_csv('clustered_sample.csv', index=False)
Time-Based Sampling
For longitudinal studies with repeated measurements:
import numpy as np

# Define sampling waves
n_waves = 4
samples_per_wave = 250
total_roads = len(roads)

# Seed the global RNG so the wave partition is reproducible.
np.random.seed(42)

# Partition the shuffled indices into non-overlapping per-wave pools.
all_indices = np.arange(total_roads)
np.random.shuffle(all_indices)
wave_size = total_roads // n_waves

for wave in range(n_waves):
    start_idx = wave * wave_size
    end_idx = start_idx + wave_size
    wave_indices = all_indices[start_idx:end_idx]
    # Sample from this wave's pool. Cap at the pool size: with
    # replace=False, np.random.choice raises ValueError whenever
    # total_roads // n_waves < samples_per_wave.
    wave_sample = np.random.choice(wave_indices,
                                   size=min(samples_per_wave,
                                            len(wave_indices)),
                                   replace=False)
    wave_roads = roads.iloc[wave_sample]
    wave_roads.to_csv(f'wave_{wave+1}_sample.csv', index=False)
Weighted Sampling by Traffic Density
Weight samples by estimated traffic volume:
# Traffic-density weights keyed by road classification: a heavier weight
# makes a road proportionally more likely to be drawn.
road_weights = {
'motorway': 10.0,
'trunk': 8.0,
'primary': 6.0,
'secondary': 4.0,
'tertiary': 2.0,
'residential': 1.0
}

# Look up each road's weight; unknown road types fall back to 0.5.
weight_lookup = roads['road_type'].map(road_weights)
roads['weight'] = weight_lookup.fillna(0.5)

# Draw 500 roads with selection probability proportional to 'weight'.
sample = roads.sample(n=500, weights='weight', random_state=42)
sample.to_csv('traffic_weighted_sample.csv', index=False)
Exclusion Zones
Exclude areas near specific points of interest:
from shapely.geometry import Point
from shapely.ops import unary_union

# Define exclusion zones (e.g., around hospitals, schools)
exclusion_points = [
(28.6139, 77.2090),  # Point of interest 1
(28.5355, 77.3910),  # Point of interest 2
]

# Create buffer zones (~1km radius). NOTE(review): 0.01 degree is only an
# approximation and its ground distance varies with latitude — confirm it
# is acceptable for the study area.
exclusion_zones = [Point(lon, lat).buffer(0.01) for lat, lon in exclusion_points]
combined_exclusion = unary_union(exclusion_zones)

# Keep roads outside every exclusion zone. Boolean-mask indexing preserves
# the original dtypes and index, unlike rebuilding a DataFrame from
# individual Series rows, and avoids the slow iterrows() path.
outside = [
    not combined_exclusion.contains(Point(lon, lat))
    for lon, lat in zip(roads['longitude'], roads['latitude'])
]
filtered_roads = roads[outside]

# Cap the draw so .sample cannot raise when fewer than 500 roads remain.
sample = filtered_roads.sample(n=min(500, len(filtered_roads)), random_state=42)
Multi-Stage Sampling
Hierarchical sampling approach:
# Stage 1: Sample districts. Cap at the number of available districts so
# np.random.choice(..., replace=False) cannot raise ValueError when fewer
# than 5 districts exist (the stage-2 loop below already guards this way).
districts = roads['district'].unique()
sampled_districts = np.random.choice(districts,
                                     size=min(5, len(districts)),
                                     replace=False)

# Stage 2: Sample up to 3 neighborhoods within each sampled district
stage2_roads = roads[roads['district'].isin(sampled_districts)]
neighborhoods = stage2_roads.groupby('district')['neighborhood'].unique()
sampled_neighborhoods = []
for district in sampled_districts:
    district_neighborhoods = neighborhoods[district]
    sampled = np.random.choice(district_neighborhoods,
                               size=min(3, len(district_neighborhoods)),
                               replace=False)
    sampled_neighborhoods.extend(sampled)

# Stage 3: Sample roads within the selected neighborhoods
final_roads = roads[roads['neighborhood'].isin(sampled_neighborhoods)]
final_sample = final_roads.sample(n=min(500, len(final_roads)),
                                  random_state=42)
final_sample.to_csv('multi_stage_sample.csv', index=False)
Validation and Quality Checks
Always validate your samples:
def validate_sample(sample_df, original_df):
    """Print a representativeness report comparing a sample to its frame.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        The drawn sample; must contain 'road_type', 'latitude' and
        'longitude' columns.
    original_df : pandas.DataFrame
        The full sampling frame with the same columns.

    Returns
    -------
    pandas.DataFrame
        Per-road-type comparison with 'Sample', 'Original' and
        'Difference' (sample share minus original share) columns.
    """
    print("Sample Validation Report")
    print("=" * 40)

    # Sample size
    print(f"Sample size: {len(sample_df)}")
    print(f"Original size: {len(original_df)}")
    if len(original_df):
        print(f"Sampling rate: {len(sample_df)/len(original_df):.2%}")
    else:
        # Guard: an empty frame would otherwise raise ZeroDivisionError.
        print("Sampling rate: n/a (empty original)")

    # Road type distribution (normalized shares, so sizes are comparable)
    print("\nRoad Type Distribution:")
    sample_dist = sample_df['road_type'].value_counts(normalize=True)
    original_dist = original_df['road_type'].value_counts(normalize=True)
    comparison = pd.DataFrame({
        'Sample': sample_dist,
        'Original': original_dist,
        'Difference': sample_dist - original_dist
    })
    print(comparison)

    # Geographic spread: the bounding boxes should roughly agree
    print("\nGeographic Coverage:")
    print(f"Latitude range - Sample: [{sample_df['latitude'].min():.4f}, "
          f"{sample_df['latitude'].max():.4f}]")
    print(f"Latitude range - Original: [{original_df['latitude'].min():.4f}, "
          f"{original_df['latitude'].max():.4f}]")
    print(f"Longitude range - Sample: [{sample_df['longitude'].min():.4f}, "
          f"{sample_df['longitude'].max():.4f}]")
    print(f"Longitude range - Original: [{original_df['longitude'].min():.4f}, "
          f"{original_df['longitude'].max():.4f}]")
    return comparison
# Run validation: compare the drawn sample against the full road frame.
# NOTE(review): relies on `sample` and `roads` from the earlier sections.
validation_results = validate_sample(sample, roads)
Best Practices
Document your sampling strategy - Include rationale for choices
Set random seeds - Ensure reproducibility
Validate samples - Check representativeness
Handle edge cases - Account for missing data or small strata
Consider field logistics - Account for accessibility and safety
Plan for non-response - Sample extras for expected attrition
Archive sampling frames - Save the full dataset used for sampling