
Working with Stata Files in Pandas

Stata is a popular statistical software package widely used in economics, social sciences, and research. If you're transitioning from Stata to Python or need to work with Stata data files (.dta), pandas provides excellent support for reading, writing, and manipulating Stata datasets while preserving important metadata like variable labels and value labels.

Why Work with Stata Files in Pandas?

  • Data preservation: Maintain variable labels, value labels, and data types
  • Seamless transition: Move from Stata workflows to Python analysis
  • Enhanced capabilities: Leverage pandas' powerful data manipulation alongside Stata's data structure
  • Collaboration: Work with colleagues who use Stata while using Python tools
  • Large datasets: Handle datasets that might be too large for Stata's memory limitations

Installing Required Packages

# Basic pandas installation
pip install pandas

# For enhanced Stata support
pip install pyreadstat

# Note: Stata support is built into pandas itself;
# no extra packages are needed for read_stata() / to_stata()
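
If you install pyreadstat, it returns the file's metadata alongside the DataFrame in a single call. A minimal sketch (the file name is an assumption):

import pyreadstat

df, meta = pyreadstat.read_dta('dataset.dta')
print(meta.column_labels)           # variable labels
print(meta.variable_value_labels)   # {variable: {value: label}}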

Reading Stata Files

Basic Read Operation

import pandas as pd

# Simple read - most common usage
df = pd.read_stata('dataset.dta')
print(df.head())
print(f"Dataset shape: {df.shape}")

Advanced Reading Options

# Read with all metadata preserved
df = pd.read_stata('dataset.dta', 
                   convert_dates=True,           # Convert Stata dates to pandas datetime
                   convert_categoricals=True,    # Convert labeled variables to categories
                   preserve_dtypes=True,         # Keep original Stata data types
                   convert_missing=True)         # Keep Stata missing codes (., .a, .b, ...) as StataMissingValue objects

# Read specific columns only
df = pd.read_stata('dataset.dta', columns=['income', 'age', 'education'])

# Read with iterator for large files
chunk_iter = pd.read_stata('large_dataset.dta', chunksize=10000)
for chunk in chunk_iter:
    # Process each chunk
    print(f"Processing chunk with {len(chunk)} rows")
    # Your analysis here
    break  # Just show first chunk for example

Handling Stata Metadata

# Access metadata via the StataReader (the returned DataFrame
# does not carry variable labels itself)
with pd.read_stata('survey_data.dta', iterator=True) as reader:
    df = reader.read()
    variable_labels = reader.variable_labels()   # {variable: label}
    value_labels = reader.value_labels()         # {label name: {code: label}}

print("Variable labels:")
for var, label in variable_labels.items():
    print(f"{var}: {label}")

# Value labels for categorical variables appear as categories
for column in df.select_dtypes(include=['category']):
    print(f"\nValue labels for {column}:")
    print(df[column].cat.categories)

Understanding Stata Data Types in Pandas

Stata to Pandas Type Mapping

import pandas as pd

# Example dataset showing type conversions
df = pd.read_stata('example.dta')

print("Data types after reading:")
print(df.dtypes)
print("\nDetailed info:")
print(df.info())

# Common Stata type conversions (with preserve_dtypes=True, the default):
# byte -> int8, int -> int16, long -> int32
# float -> float32, double -> float64
# str# -> object (string)
# Labeled values -> category (if convert_categoricals=True)
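
To see the preservation in action, compare the default read against preserve_dtypes=False, which upcasts every numeric column to pandas' standard 64-bit types (file name assumed):

df_preserved = pd.read_stata('example.dta')                        # keeps int8/int16/int32, float32
df_uniform = pd.read_stata('example.dta', preserve_dtypes=False)   # int64/float64 only
print(df_preserved.dtypes)
print(df_uniform.dtypes)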

Working with Stata Dates

# Stata stores dates differently than pandas
df = pd.read_stata('dataset_with_dates.dta', convert_dates=True)

# Check date columns
date_columns = df.select_dtypes(include=['datetime64']).columns
print(f"Date columns: {list(date_columns)}")

# Manual date conversion if needed
df['survey_date'] = pd.to_datetime(df['stata_date'], origin='1960-01-01', unit='D')

# Extract date components (common in Stata workflows)
df['year'] = df['survey_date'].dt.year
df['month'] = df['survey_date'].dt.month
df['quarter'] = df['survey_date'].dt.quarter

Handling Missing Values

# Stata uses several missing value codes (., .a, .b, etc.);
# by default, read_stata converts all of them to NaN
df = pd.read_stata('dataset.dta')

# Check missing values
print("Missing values by column:")
print(df.isnull().sum())

# Stata missing value patterns
missing_pattern = df.isnull()
print("\nRows with any missing values:", missing_pattern.any(axis=1).sum())
print("Complete cases:", (~missing_pattern.any(axis=1)).sum())

# Handle missing values (common Stata operations)
# Drop missing observations
df_complete = df.dropna()

# Fill missing with mean (for numeric variables)
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

Common Stata Operations in Pandas

Variable Creation and Modification

import numpy as np

# Generate new variables (Stata: gen)
df['log_income'] = np.log(df['income'])
df['income_squared'] = df['income'] ** 2

# Conditional variable creation (Stata: gen var = ... if condition)
df['high_income'] = (df['income'] > df['income'].median()).astype(int)

# Replace values (Stata: replace)
df.loc[df['age'] < 0, 'age'] = np.nan  # Replace negative ages with missing

# Recode variables (Stata: recode)
df['age_group'] = pd.cut(df['age'],
                        bins=[17, 25, 35, 50, 65, 100],
                        labels=['18-25', '26-35', '36-50', '51-65', '66+'])

# Create dummy variables (Stata: tab var, gen())
education_dummies = pd.get_dummies(df['education'], prefix='edu')
df = pd.concat([df, education_dummies], axis=1)

Data Summary and Description

# Describe data (Stata: describe)
print("Dataset description:")
print(df.info())
print(f"\nObservations: {len(df)}")
print(f"Variables: {len(df.columns)}")

# Summary statistics (Stata: summarize)
print("\nSummary statistics:")
print(df.describe())

# Detailed summary for specific variables
print("\nIncome summary:")
print(df['income'].describe())

# Stata-style summary with additional percentiles
income_summary = df['income'].describe(percentiles=[.01, .05, .10, .25, .50, .75, .90, .95, .99])
print(income_summary)

Frequency Tables and Cross-tabulations

# Frequency tables (Stata: tab varname)
print("Education frequency:")
print(df['education'].value_counts().sort_index())

# With percentages
print("\nEducation percentages:")
print(df['education'].value_counts(normalize=True).sort_index() * 100)

# Two-way tabulation (Stata: tab var1 var2)
print("\nEducation by Gender:")
crosstab = pd.crosstab(df['education'], df['gender'])
print(crosstab)

# With row percentages
print("\nRow percentages:")
print(pd.crosstab(df['education'], df['gender'], normalize='index') * 100)

# With chi-square test
from scipy.stats import chi2_contingency
chi2, p_value, dof, expected = chi2_contingency(crosstab)
print(f"\nChi-square test: χ² = {chi2:.4f}, p-value = {p_value:.4f}")

Sorting and Ordering

# Sort data (Stata: sort varlist)
df_sorted = df.sort_values('income')

# Multiple sort keys
df_sorted = df.sort_values(['education', 'income'], ascending=[True, False])

# Generate sort index
df['rank_income'] = df['income'].rank(method='min', ascending=False)
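
By-group statistics (Stata: bysort education: egen mean_inc = mean(income)) map to groupby plus transform. A short sketch, assuming education and income columns exist:

# Attach the group mean to every row (Stata: bysort ...: egen)
df['mean_inc_by_edu'] = df.groupby('education')['income'].transform('mean')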

Advanced Stata File Operations

Working with Large Stata Files

def process_large_stata_file(filename, chunksize=10000):
    """
    Process large Stata files in chunks to manage memory
    """
    chunk_results = []
    
    # Read file in chunks
    for chunk_num, chunk in enumerate(pd.read_stata(filename, chunksize=chunksize)):
        print(f"Processing chunk {chunk_num + 1} with {len(chunk)} observations")
        
        # Perform analysis on chunk
        chunk_summary = {
            'chunk': chunk_num + 1,
            'n_obs': len(chunk),
            'mean_income': chunk['income'].mean(),
            'median_income': chunk['income'].median()
        }
        chunk_results.append(chunk_summary)
    
    # Combine results
    results_df = pd.DataFrame(chunk_results)
    return results_df

# Example usage
# results = process_large_stata_file('large_survey.dta', chunksize=5000)

Preserving Stata Attributes

# Read with the StataReader so variable labels can be captured
with pd.read_stata('survey.dta', iterator=True) as reader:
    df = reader.read()
    variable_labels = reader.variable_labels()

# Store original structure before manipulation
original_dtypes = df.dtypes.to_dict()
original_columns = df.columns.tolist()

# After data manipulation, you can reference original structure
print("Original variable order:", original_columns)
print("Original data types:", original_dtypes)

Converting Between Long and Wide Format

# Wide to long (Stata: reshape long)
# Assume we have variables: income2019, income2020, income2021
wide_df = pd.DataFrame({
    'person_id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'income2019': [50000, 55000, 60000],
    'income2020': [52000, 57000, 62000],
    'income2021': [54000, 59000, 64000]
})

# Reshape to long format
long_df = pd.melt(wide_df, 
                 id_vars=['person_id', 'name'],
                 value_vars=['income2019', 'income2020', 'income2021'],
                 var_name='year_var', 
                 value_name='income')

# Clean up year variable
long_df['year'] = long_df['year_var'].str.extract(r'(\d+)', expand=False).astype(int)
long_df = long_df.drop('year_var', axis=1)
print("Long format:")
print(long_df)

# Long to wide (Stata: reshape wide)
wide_again = long_df.pivot(index=['person_id', 'name'], 
                          columns='year', 
                          values='income').reset_index()
wide_again.columns.name = None  # Remove column name from pivot
print("\nBack to wide format:")
print(wide_again)
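
pandas also provides pd.wide_to_long, which mirrors Stata's reshape long syntax more directly. A sketch using the same wide_df:

# The stub 'income' plus its numeric suffix becomes (person_id, year) rows
long_df2 = pd.wide_to_long(wide_df, stubnames='income',
                           i='person_id', j='year').reset_index()
print(long_df2)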

Writing Data Back to Stata Format

Basic Write Operation

# Save DataFrame as Stata file
df.to_stata('output_data.dta', write_index=False)

# With additional options
df.to_stata('output_data.dta',
           write_index=False,
           convert_dates={'date_column': 'td'},  # Specify date format
           variable_labels={'income': 'Annual Income in USD',
                           'age': 'Age in years'},
           version=117)  # Write Stata 13 file format (the default is 114)
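
Since pandas 1.4, to_stata can also attach value labels to plain integer columns via the value_labels argument. A minimal sketch, assuming a gender column coded 0/1:

df.to_stata('labeled_output.dta',
           write_index=False,
           value_labels={'gender': {0: 'Female', 1: 'Male'}})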

Advanced Write Options

# Prepare data for Stata output
df_for_stata = df.copy()

# Handle long variable names (Stata has 32 character limit for older versions)
def shorten_varnames(df, max_length=32):
    """Shorten variable names for Stata compatibility"""
    new_names = {}
    for col in df.columns:
        if len(col) > max_length:
            new_names[col] = col[:max_length]
    return df.rename(columns=new_names)

df_for_stata = shorten_varnames(df_for_stata)

# Convert problematic data types
# Convert boolean to numeric
bool_cols = df_for_stata.select_dtypes(include=['bool']).columns
df_for_stata[bool_cols] = df_for_stata[bool_cols].astype(int)

# Ensure string variables aren't too long
str_cols = df_for_stata.select_dtypes(include=['object']).columns
for col in str_cols:
    max_len = df_for_stata[col].astype(str).str.len().max()
    print(f"Column {col}: max string length = {max_len}")

# Write with error handling
try:
    df_for_stata.to_stata('clean_output.dta', write_index=False)
    print("Successfully wrote Stata file")
except Exception as e:
    print(f"Error writing Stata file: {e}")

Practical Example: Economic Survey Analysis

import pandas as pd
import numpy as np
from scipy import stats

# Load economic survey data
df = pd.read_stata('economic_survey.dta', convert_categoricals=True)

print("=== ECONOMIC SURVEY ANALYSIS ===")
print(f"Sample size: {len(df)} observations")
print(f"Variables: {len(df.columns)}")

# 1. Data cleaning
print("\n1. DATA CLEANING")
print("Missing values by variable:")
missing_summary = df.isnull().sum()
print(missing_summary[missing_summary > 0])

# Handle missing income data
df = df.dropna(subset=['income'])  # Drop obs with missing income
print(f"Sample after dropping missing income: {len(df)}")

# 2. Descriptive statistics
print("\n2. DESCRIPTIVE STATISTICS")
print("Income summary:")
income_stats = df['income'].describe()
print(income_stats)

# Income by education level
print("\nIncome by education level:")
education_income = df.groupby('education_level')['income'].agg(['count', 'mean', 'std', 'median'])
print(education_income)

# 3. Create analytical variables
print("\n3. VARIABLE CREATION")
# Log income for regression analysis
df['log_income'] = np.log(df['income'])

# Education categories
df['college_grad'] = (df['education_level'] == 'College+').astype(int)

# Age groups
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 45, 60, 100], 
                        labels=['Under 30', '30-45', '46-60', 'Over 60'])

# 4. Statistical analysis
print("\n4. STATISTICAL ANALYSIS")

# T-test: Income difference by gender
male_income = df[df['gender'] == 'Male']['income']
female_income = df[df['gender'] == 'Female']['income']
t_stat, p_value = stats.ttest_ind(male_income, female_income)
print(f"Gender income gap t-test: t = {t_stat:.3f}, p = {p_value:.3f}")

# Correlation analysis
numeric_vars = ['income', 'age', 'years_experience']
correlation_matrix = df[numeric_vars].corr()
print("\nCorrelation matrix:")
print(correlation_matrix)

# 5. Save processed data
print("\n5. SAVING RESULTS")

# Create summary dataset
summary_stats = df.groupby(['education_level', 'gender']).agg({
    'income': ['count', 'mean', 'std'],
    'age': 'mean'
}).round(2)

# Flatten column names
summary_stats.columns = ['_'.join(col).strip() for col in summary_stats.columns.values]
summary_stats = summary_stats.reset_index()

# Save both detailed and summary data
df.to_stata('processed_survey.dta', write_index=False,
           variable_labels={
               'log_income': 'Log of annual income',
               'college_grad': 'College graduate dummy (1=yes)',
               'age_group': 'Age group categories'
           })

summary_stats.to_stata('income_summary.dta', write_index=False)
print("Saved processed data and summary statistics to Stata files")

Best Practices for Stata-Pandas Workflow

1. Data Validation

def validate_stata_data(df, expected_vars=None):
    """
    Validate Stata data after reading
    """
    print("Data validation report:")
    print(f"Observations: {len(df)}")
    print(f"Variables: {len(df.columns)}")
    
    if expected_vars:
        missing_vars = set(expected_vars) - set(df.columns)
        if missing_vars:
            print(f"WARNING: Missing expected variables: {missing_vars}")
    
    # Check for duplicate observations
    if df.duplicated().any():
        print(f"WARNING: Found {df.duplicated().sum()} duplicate rows")
    
    # Check data types
    print("\nData types:")
    print(df.dtypes.value_counts())
    
    return df

# Example usage
df = pd.read_stata('survey.dta')
df = validate_stata_data(df, expected_vars=['income', 'age', 'education'])

2. Memory Management

def optimize_stata_dataframe(df):
    """
    Optimize memory usage for large Stata datasets
    """
    original_memory = df.memory_usage(deep=True).sum() / 1024**2
    
    # Convert object types to category where appropriate
    for col in df.select_dtypes(include=['object']):
        if df[col].nunique() / len(df) < 0.5:  # If less than 50% unique values
            df[col] = df[col].astype('category')
    
    # Downcast numeric types
    for col in df.select_dtypes(include=['int64']):
        df[col] = pd.to_numeric(df[col], downcast='integer')
    
    for col in df.select_dtypes(include=['float64']):
        df[col] = pd.to_numeric(df[col], downcast='float')
    
    new_memory = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage reduced from {original_memory:.1f} MB to {new_memory:.1f} MB")
    print(f"Reduction: {(1 - new_memory/original_memory)*100:.1f}%")
    
    return df
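
A quick usage sketch (the file name is an assumption):

# Example usage
df = pd.read_stata('large_survey.dta')
df = optimize_stata_dataframe(df)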

3. Reproducible Analysis

# Create analysis script template
analysis_template = """
# Stata to Pandas Analysis Template
import pandas as pd
import numpy as np
from datetime import datetime

# Set random seed for reproducibility
np.random.seed(42)

# Analysis parameters
DATA_FILE = 'input_data.dta'
OUTPUT_PREFIX = f'analysis_{datetime.now().strftime("%Y%m%d")}'

print(f"Starting analysis: {datetime.now()}")
print(f"Data file: {DATA_FILE}")
print(f"Output prefix: {OUTPUT_PREFIX}")

# Load and process data
df = pd.read_stata(DATA_FILE)
# ... your analysis code here ...

print(f"Analysis completed: {datetime.now()}")
"""

Troubleshooting Common Issues

File Reading Problems

# Handle encoding issues
# (recent pandas versions no longer accept an encoding argument here)
try:
    df = pd.read_stata('problematic_file.dta')
except UnicodeDecodeError:
    print("Encoding error - falling back to pyreadstat with an explicit encoding")
    import pyreadstat
    df, meta = pyreadstat.read_dta('problematic_file.dta', encoding='latin1')

# Handle version compatibility
try:
    df = pd.read_stata('old_format.dta')
except Exception as e:
    print(f"Version issue: {e}")
    # Try reading with preserve_dtypes=False
    df = pd.read_stata('old_format.dta', preserve_dtypes=False)

Writing Problems

# Handle variable name length issues
def prepare_for_stata_export(df):
    """
    Prepare DataFrame for Stata export by handling common issues
    """
    df_copy = df.copy()
    
    # Shorten long variable names
    rename_map = {}
    for col in df_copy.columns:
        if len(col) > 32:
            new_name = col[:32]
            rename_map[col] = new_name
            print(f"Renaming '{col}' to '{new_name}'")
    
    if rename_map:
        df_copy = df_copy.rename(columns=rename_map)
    
    # Handle problematic data types
    for col in df_copy.columns:
        if df_copy[col].dtype == 'object':
            # Check if strings are too long
            max_len = df_copy[col].astype(str).str.len().max()
            if max_len > 2045:  # Stata's fixed-width string limit (str2045)
                print(f"WARNING: Column '{col}' has strings longer than Stata's limit")
                # Truncate here; alternatively pass convert_strl=[col] to to_stata()
                df_copy[col] = df_copy[col].astype(str).str[:2045]
    
    return df_copy

# Use before exporting
df_for_export = prepare_for_stata_export(df)
df_for_export.to_stata('safe_export.dta', write_index=False)

Key Takeaways

  • Pandas provides excellent support for Stata files with read_stata() and to_stata() (a round-trip sketch follows this list)
  • Use convert_dates=True and convert_categoricals=True to preserve Stata metadata
  • Handle missing values carefully as Stata uses different missing value codes
  • Most common Stata operations have pandas equivalents
  • Consider memory optimization for large datasets
  • Validate data after reading and before writing
  • Be aware of variable name and string length limitations when exporting
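
To tie the takeaways together, here is a minimal round-trip sketch; the file names and the income column are assumptions:

import pandas as pd

# Read with the StataReader so variable labels survive the trip
with pd.read_stata('input.dta', iterator=True) as reader:
    df = reader.read()
    labels = reader.variable_labels()

# ... clean and analyze ...
df = df.dropna(subset=['income'])

# Write back, carrying the labels forward
df.to_stata('output.dta', write_index=False, variable_labels=labels)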

What's Next?

Now that you understand working with Stata files, explore pandas' other statistical-format readers, such as read_spss() and read_sas(), which follow the same patterns.

Working with Stata files in pandas opens up powerful possibilities for data analysis while maintaining compatibility with existing workflows and collaborators.
