Do not use the page_table_lock for COW. Instead, re-check the pte with an
atomic cmpxchg once the new page has been fully prepared. This increases
parallelism in the page fault handler for copy-on-write situations, such
as write faults after a fork.

This patch depends on the following patches having been applied first:

	make_rss_atomic
	pte_cmpxchg
	ptl_drop_first_use
	no_ptl_do_anon_page

Signed-off-by: Christoph Lameter
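The pattern in both the do_wp_page and do_swap_page changes below is the
same: fully prepare the new page first, then publish it with one atomic
compare-and-exchange on the pte, and back out if some other thread changed
the pte in the meantime. A minimal userspace sketch of that re-check,
assuming a plain unsigned long stands in for pte_t and GCC's
__sync_bool_compare_and_swap stands in for ptep_cmpxchg (pte_cmpxchg_demo
and the constants are made up for illustration, not kernel API):

	#include <stdio.h>

	typedef unsigned long pte_t;	/* stand-in for the real pte type */

	/* Rough analogue of ptep_cmpxchg(): update the pte only if unchanged. */
	static int pte_cmpxchg_demo(pte_t *ptep, pte_t old, pte_t new)
	{
		return __sync_bool_compare_and_swap(ptep, old, new);
	}

	int main(void)
	{
		pte_t pte = 0x1000;	/* shared, write-protected mapping */
		pte_t orig_pte = pte;	/* snapshot taken before copying */
		pte_t new_pte = 0x2000;	/* private writable copy, fully set up */

		/* copy_user_highpage() and rmap setup would happen here */

		if (pte_cmpxchg_demo(&pte, orig_pte, new_pte))
			printf("won: pte is now %#lx\n", pte);
		else
			printf("lost: somebody else faulted first, pte %#lx\n", pte);
		return 0;
	}

The ordering is the point: everything the new pte refers to must be
initialized before the exchange, because a successful cmpxchg makes the
pte visible to all other threads at once, with no lock held.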
Index: linux-2.6.11/mm/memory.c
===================================================================
--- linux-2.6.11.orig/mm/memory.c	2005-03-03 11:29:06.000000000 -0800
+++ linux-2.6.11/mm/memory.c	2005-03-03 11:58:27.000000000 -0800
@@ -1246,21 +1246,6 @@ static inline pte_t maybe_mkwrite(pte_t
 }
 
 /*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
-		pte_t *page_table)
-{
-	pte_t entry;
-
-	flush_cache_page(vma, address);
-	entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
-			      vma);
-	ptep_establish(vma, address, page_table, entry);
-	update_mmu_cache(vma, address, entry);
-}
-
-/*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
@@ -1344,12 +1329,14 @@ static int do_wp_page(struct mm_struct *
 		copy_user_highpage(new_page, old_page, address);
 	}
 	/*
-	 * Re-check the pte - so far we may not have acquired the
-	 * page_table_lock
+	 * Re-check the pte via a cmpxchg
 	 */
-	spin_lock(&mm->page_table_lock);
+	page_table_atomic_start(mm);
 	page_table = pte_offset_map(pmd, address);
-	if (likely(pte_same(*page_table, pte))) {
+	flush_cache_page(vma, address);
+	entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
+			      vma);
+	if (ptep_cmpxchg(page_table, pte, entry)) {
 		if (PageAnon(old_page))
 			update_mm_counter(mm, anon_rss, -1);
 		if (PageReserved(old_page)) {
@@ -1358,7 +1345,7 @@ static int do_wp_page(struct mm_struct *
 			update_mem_hiwater();
 		} else
 			page_remove_rmap(old_page);
-		break_cow(vma, new_page, address, page_table);
+		update_mmu_cache(vma, address, entry);
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
 
@@ -1367,7 +1354,7 @@ static int do_wp_page(struct mm_struct *
 	}
 	pte_unmap(page_table);
 	page_cache_release(new_page);
-	spin_unlock(&mm->page_table_lock);
+	page_table_atomic_stop(mm);
 	return VM_FAULT_MINOR;
 
 no_new_page:
@@ -1715,15 +1702,15 @@ static int do_swap_page(struct mm_struct
 			/*
 			 * Back out if somebody else faulted in this pte
 			 */
-			spin_lock(&mm->page_table_lock);
+			page_table_atomic_start(mm);
 			page_table = pte_offset_map(pmd, address);
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			else
 				ret = VM_FAULT_MINOR;
 			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
-			goto out;
+			page_table_atomic_stop(mm);
+			return ret;
 		}
 
 		/* Had to read the page from swap area: Major fault */
@@ -1736,53 +1723,39 @@ static int do_swap_page(struct mm_struct
 	mark_page_accessed(page);
 	lock_page(page);
 
 	/*
-	 * Back out if somebody else faulted in this pte
+	 * Back out if somebody else faulted this pte
 	 */
-	spin_lock(&mm->page_table_lock);
-	page_table = pte_offset_map(pmd, address);
-	if (unlikely(!pte_same(*page_table, orig_pte))) {
-		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
-		unlock_page(page);
-		page_cache_release(page);
-		ret = VM_FAULT_MINOR;
-		goto out;
-	}
-
-	/* The page isn't present yet, go ahead with the fault. */
-
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
-
-	update_mm_counter(mm, rss, 1);
-	acct_update_integrals();
-	update_mem_hiwater();
-
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		write_access = 0;
 	}
-	unlock_page(page);
 
 	flush_icache_page(vma, page);
-	set_pte(page_table, pte);
-	page_add_anon_rmap(page, vma, address);
-
-	/* No need to invalidate - it was non-present before */
-	update_mmu_cache(vma, address, pte);
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
-
-	if (write_access) {
-		page_table_atomic_start(mm);
-		if (do_wp_page(mm, vma, address,
+	if (ptep_cmpxchg(page_table, orig_pte, pte)) {
+		unlock_page(page);
+
+		page_add_anon_rmap(page, vma, address);
+		swap_free(entry);
+		if (vm_swap_full())
+			remove_exclusive_swap_page(page);
+		update_mm_counter(mm, rss, 1);
+		acct_update_integrals();
+		update_mem_hiwater();
+		if (write_access) {
+			if (do_wp_page(mm, vma, address,
 			page_table, pmd, pte) == VM_FAULT_OOM)
-			ret = VM_FAULT_OOM;
+				return VM_FAULT_OOM;
+			return ret;
+		}
+	} else {
+		/* Another thread was racing with us and won */
+		pte_unmap(page_table);
+		unlock_page(page);
+		page_cache_release(page);
 	}
-
-out:
+
+	page_table_atomic_stop(mm);
 	return ret;
 }
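The else branch in the last hunk is the cost of going lockless: the loser
of the cmpxchg must undo its own preparation, unmapping the pte, unlocking
the page and dropping its page reference. A toy two-thread illustration of
that back-out, again only a sketch (pthreads, with malloc/free standing in
for faulting threads and page references; build with gcc -pthread):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	static void *slot;	/* stands in for the pte: NULL means not present */

	static void *fault(void *arg)
	{
		void *mine = malloc(64);	/* stands in for the swapped-in page */

		if (__sync_bool_compare_and_swap(&slot, NULL, mine)) {
			/* Winner: our page is now mapped in. */
			printf("thread %ld installed its page\n", (long)arg);
		} else {
			/* Another thread was racing with us and won: back out. */
			printf("thread %ld lost, releasing its copy\n", (long)arg);
			free(mine);
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		pthread_create(&t1, NULL, fault, (void *)1L);
		pthread_create(&t2, NULL, fault, (void *)2L);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		free(slot);	/* release the winner's page */
		return 0;
	}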